EDN Admin
Well-known member
Hi all, I am trying to print out all the HTML DOM nodes from a page and save the result in an XML document. Here is my program:
[code]
<span style="color:Blue; void CHTMLBrowserDlg:rocessHTMLDoc(MSHTML::IHTMLDocument2Ptr pDoc)
{
MSHTML::IHTMLDocument3Ptr pDoc3 = NULL;
MSHTML::IHTMLElementPtr pHtmlElem = NULL;
MSHTML::IHTMLElementPtr pElemBody = NULL;
MSXML2::IXMLDOMDocumentPtr pXMLDoc = NULL;
MSXML2::IXMLDOMElementPtr pHtmlXMLDomElem = NULL;
BSTR bstrTagName;
HRESULT hr;
hr = pDoc->QueryInterface(IID_IHTMLDocument3, (<span style="color:Blue; void**)&pDoc3);
<span style="color:Blue; if(SUCCEEDED(hr))
{
pHtmlElem = pDoc3->documentElement;
}
hr = pDoc->get_body(&pElemBody);
<span style="color:Blue; if(SUCCEEDED(hr))
{
HRESULT hr2 = pXMLDoc.CreateInstance(__uuidof(MSXML2:OMDocument60), NULL, CLSCTX_INPROC_SERVER);
<span style="color:Blue; if(FAILED(hr2))
{
AfxMessageBox(L<span style="color:#A31515; "Failed to instantiate an XML DOM.");
<span style="color:Blue; return;
}
<span style="color:Blue; if(pHtmlElem->get_tagName(&bstrTagName) == S_OK)
{
pHtmlXMLDomElem = pXMLDoc->createElement(bstrTagName);
}
MSHTML::IHTMLDOMNodePtr pHTMLDOMNode =NULL;
MSHTML::IHTMLDOMNodePtr pHTMLBodyDOMNode =NULL;
MSXML2::IXMLDOMElementPtr pHtmlXmlDomElem = NULL;
BSTR bstrNodeName;
hr = pHtmlElem->QueryInterface(IID_IHTMLDOMNode,(<span style="color:Blue; void**)&pHTMLDOMNode);
<span style="color:Blue; if(hr == S_OK)
{
bstrNodeName = pHTMLDOMNode->nodeName;
pHtmlXmlDomElem = pXMLDoc->createElement(bstrNodeName);
hr = pElemBody->QueryInterface(IID_IHTMLDOMNode,(<span style="color:Blue; void**)&pHTMLBodyDOMNode);
<span style="color:Blue; if(hr == S_OK)
{
ProcessDomNodeSmartWrapper(pHTMLBodyDOMNode, pXMLDoc, pHtmlXmlDomElem);
}
pXMLDoc->appendChild(pHtmlXmlDomElem);
<span style="color:Blue; if(DOMDocSaveLocation(pXMLDoc))
AfxMessageBox(_T(<span style="color:#A31515; "XML DOM Document saved successfully"));
}
}
}
[/code]
[code]
<span style="color:Blue; void CHTMLBrowserDlg:rocessDomNodeSmartWrapper(MSHTML::IHTMLDOMNodePtr pNode, MSXML2::IXMLDOMDocumentPtr pXMLDoc, MSXML2::IXMLDOMElementPtr pParentXMLDOMElem)
{
<span style="color:Blue; if(pNode->nodeType == 1)
{
MSXML2::IXMLDOMElementPtr pXMLElem = pXMLDoc->createElement(pNode->nodeName);
pParentXMLDOMElem->appendChild(pXMLElem);
MSHTML::IHTMLDOMChildrenCollectionPtr pChElemColl = MSHTML::IHTMLDOMChildrenCollectionPtr(pNode->childNodes);
<span style="color:Blue; long lLength = pChElemColl->length;
<span style="color:Blue; for(<span style="color:Blue; int i = 0; i < lLength; i++)
{
ProcessDomNodeSmartWrapper(MSHTML::IHTMLDOMNodePtr(pChElemColl->item(i)), pXMLDoc, pXMLElem);
}
}<span style="color:Blue; else <span style="color:Blue; if(pNode->nodeType == 3){
VARIANT varNodeVal;
HRESULT hr;
hr = pNode->get_nodeValue(&varNodeVal);
<span style="color:Blue; if(hr == S_OK)
{
pParentXMLDOMElem->text = (pNode->nodeValue).bstrVal;
}
}
}
[/code]
A sample HTML source looks like this:
[code]
<html xmlns="http://www.w3.org/1999/xhtml" >
<body class="no-js">
<script type="text/javascript">document.getElementsByTagName("body")[0].className="js";</script>
<div id="ctl00_ctl00_pnlOmniture">
<input name="ctl00$ctl00$hdnLoggedIn" type="hidden" id="ctl00_ctl00_hdnLoggedIn" />
<script language="JavaScript" type="text/javascript" src="/bmi/src/js/v7/s_code.js"></script>
<script language="JavaScript" type="text/javascript" src="/bmi/src/js/v7/omniture.js"></script>
<!-- SiteCatalyst code version: H.19.4.
Copyright 1997-2009 Omniture, Inc. More info available at
http://www.omniture.com -->
<script language="JavaScript" type="text/javascript">
<!--/* You may give each page an identifying name, server, and channel on the next lines. */
/********* FOR ALL PAGES *********/
s.pageName='homepage';s.server="www.flybmi.com";s.channel='homepage';
/********* FOR ALL PAGES *********/
/************* DO NOT ALTER ANYTHING BELOW THIS LINE ! **************/
var s_code=s.t();if(s_code)document.write(s_code)//--></script>
<script language="JavaScript" type="text/javascript"><!--
if(navigator.appVersion.indexOf('MSIE')>=0)document.write(unescape('%3C')+'!-'+'-')
//--></script><noscript><a href="http://www.omniture.com" title="Web Analytics">
<img src="http://metrics.flybmi.com/b/ss/flybmicomprod/1/H.19.4--NS/0" id="ctl00_ctl00_noScriptImgTag" height="1" width="1" border="0" /></a></noscript><!--/DO NOT REMOVE/-->
<!-- End SiteCatalyst code version: H.19.4. -->
</div>
</body>
</html>
[/code]
Now the odd behaviour is that some nodes are not actually appended to the parent XML node when I walk down the HTML DOM tree top-down; e.g. the first 3 children (input, script and script) of the div element are missing from the resulting XML doc. So, what's
wrong with the code?
cheers
daiyue
View the full article
[code]
<span style="color:Blue; void CHTMLBrowserDlg:rocessHTMLDoc(MSHTML::IHTMLDocument2Ptr pDoc)
{
MSHTML::IHTMLDocument3Ptr pDoc3 = NULL;
MSHTML::IHTMLElementPtr pHtmlElem = NULL;
MSHTML::IHTMLElementPtr pElemBody = NULL;
MSXML2::IXMLDOMDocumentPtr pXMLDoc = NULL;
MSXML2::IXMLDOMElementPtr pHtmlXMLDomElem = NULL;
BSTR bstrTagName;
HRESULT hr;
hr = pDoc->QueryInterface(IID_IHTMLDocument3, (<span style="color:Blue; void**)&pDoc3);
<span style="color:Blue; if(SUCCEEDED(hr))
{
pHtmlElem = pDoc3->documentElement;
}
hr = pDoc->get_body(&pElemBody);
<span style="color:Blue; if(SUCCEEDED(hr))
{
HRESULT hr2 = pXMLDoc.CreateInstance(__uuidof(MSXML2:OMDocument60), NULL, CLSCTX_INPROC_SERVER);
<span style="color:Blue; if(FAILED(hr2))
{
AfxMessageBox(L<span style="color:#A31515; "Failed to instantiate an XML DOM.");
<span style="color:Blue; return;
}
<span style="color:Blue; if(pHtmlElem->get_tagName(&bstrTagName) == S_OK)
{
pHtmlXMLDomElem = pXMLDoc->createElement(bstrTagName);
}
MSHTML::IHTMLDOMNodePtr pHTMLDOMNode =NULL;
MSHTML::IHTMLDOMNodePtr pHTMLBodyDOMNode =NULL;
MSXML2::IXMLDOMElementPtr pHtmlXmlDomElem = NULL;
BSTR bstrNodeName;
hr = pHtmlElem->QueryInterface(IID_IHTMLDOMNode,(<span style="color:Blue; void**)&pHTMLDOMNode);
<span style="color:Blue; if(hr == S_OK)
{
bstrNodeName = pHTMLDOMNode->nodeName;
pHtmlXmlDomElem = pXMLDoc->createElement(bstrNodeName);
hr = pElemBody->QueryInterface(IID_IHTMLDOMNode,(<span style="color:Blue; void**)&pHTMLBodyDOMNode);
<span style="color:Blue; if(hr == S_OK)
{
ProcessDomNodeSmartWrapper(pHTMLBodyDOMNode, pXMLDoc, pHtmlXmlDomElem);
}
pXMLDoc->appendChild(pHtmlXmlDomElem);
<span style="color:Blue; if(DOMDocSaveLocation(pXMLDoc))
AfxMessageBox(_T(<span style="color:#A31515; "XML DOM Document saved successfully"));
}
}
}
[/code]
[code]
<span style="color:Blue; void CHTMLBrowserDlg:rocessDomNodeSmartWrapper(MSHTML::IHTMLDOMNodePtr pNode, MSXML2::IXMLDOMDocumentPtr pXMLDoc, MSXML2::IXMLDOMElementPtr pParentXMLDOMElem)
{
<span style="color:Blue; if(pNode->nodeType == 1)
{
MSXML2::IXMLDOMElementPtr pXMLElem = pXMLDoc->createElement(pNode->nodeName);
pParentXMLDOMElem->appendChild(pXMLElem);
MSHTML::IHTMLDOMChildrenCollectionPtr pChElemColl = MSHTML::IHTMLDOMChildrenCollectionPtr(pNode->childNodes);
<span style="color:Blue; long lLength = pChElemColl->length;
<span style="color:Blue; for(<span style="color:Blue; int i = 0; i < lLength; i++)
{
ProcessDomNodeSmartWrapper(MSHTML::IHTMLDOMNodePtr(pChElemColl->item(i)), pXMLDoc, pXMLElem);
}
}<span style="color:Blue; else <span style="color:Blue; if(pNode->nodeType == 3){
VARIANT varNodeVal;
HRESULT hr;
hr = pNode->get_nodeValue(&varNodeVal);
<span style="color:Blue; if(hr == S_OK)
{
pParentXMLDOMElem->text = (pNode->nodeValue).bstrVal;
}
}
}
[/code]
A sample HTML source looks like this:
[code]
<html xmlns="http://www.w3.org/1999/xhtml" >
<body class="no-js">
<script type="text/javascript">document.getElementsByTagName("body")[0].className="js";</script>
<div id="ctl00_ctl00_pnlOmniture">
<input name="ctl00$ctl00$hdnLoggedIn" type="hidden" id="ctl00_ctl00_hdnLoggedIn" />
<script language="JavaScript" type="text/javascript" src="/bmi/src/js/v7/s_code.js"></script>
<script language="JavaScript" type="text/javascript" src="/bmi/src/js/v7/omniture.js"></script>
<!-- SiteCatalyst code version: H.19.4.
Copyright 1997-2009 Omniture, Inc. More info available at
http://www.omniture.com -->
<script language="JavaScript" type="text/javascript">
<!--/* You may give each page an identifying name, server, and channel on the next lines. */
/********* FOR ALL PAGES *********/
s.pageName='homepage';s.server="www.flybmi.com";s.channel='homepage';
/********* FOR ALL PAGES *********/
/************* DO NOT ALTER ANYTHING BELOW THIS LINE ! **************/
var s_code=s.t();if(s_code)document.write(s_code)//--></script>
<script language="JavaScript" type="text/javascript"><!--
if(navigator.appVersion.indexOf('MSIE')>=0)document.write(unescape('%3C')+'!-'+'-')
//--></script><noscript><a href="http://www.omniture.com" title="Web Analytics">
<img src="http://metrics.flybmi.com/b/ss/flybmicomprod/1/H.19.4--NS/0" id="ctl00_ctl00_noScriptImgTag" height="1" width="1" border="0" /></a></noscript><!--/DO NOT REMOVE/-->
<!-- End SiteCatalyst code version: H.19.4. -->
</div>
</body>
</html>
[/code]
Now the odd behaviour is that some nodes are not actually appended to the parent XML node when I walk down the HTML DOM tree top-down; e.g. the first 3 children (input, script and script) of the div element are missing from the resulting XML doc. So, what's
wrong with the code?
cheers
daiyue
View the full article