get_responseXML returns an empty object

fedroponce · Dec 1, 2018

I am coding a simple application which will consume RSS feeds. The application uses WinInet to connect to and retrieve the RSS feed, then creates a DOM document to extract information from it.

Although I have come up with code that does what I want, I have found that for some RSS feeds it fails by creating a DOM document which is empty. This is a little test app I created to illustrate the issue:

#import "msxml3.dll"
using namespace MSXML2; // For Msxml3.dll.

#include "stdio.h"
#include "tchar.h"
#include <regex>

using namespace std;
using namespace std::tr1;
// Capacity, in WCHARs, of the link and comic-URL buffers declared in main().
#define URL_SIZE 2048
// Capacity, in WCHARs, of the item-title buffer declared in main().
#define TITLE_SIZE 1024
// Address handed to IXMLHTTPRequest::open() in main(). Only one definition may be active;
// the commented-out alternatives below are other feeds used while testing.
// NOTE(review): the active value (and the last alternative) reads like a feed title, not a
// URL -- presumably the original link was replaced by its display text when the code was
// posted. Confirm the real feed URL before building.
#define DILBERT_RSS_FEED "Stick Figure Hamlet"
//#define DILBERT_RSS_FEED "http://www.phdcomics.com/gradfeed_justcomics.php"
//#define DILBERT_RSS_FEED "Dilbert Daily Strip"

//#pragma comment(lib, "rpcrt4")

// Converts a NUL-terminated wide string to a newly allocated ANSI (CP_ACP) string.
//
// pszW   - source string; may be NULL, in which case *ppszA is set to NULL and
//          NOERROR is returned.
// ppszA  - receives the converted string, allocated with CoTaskMemAlloc. The caller
//          owns it and must release it with CoTaskMemFree. Set to NULL on any failure.
//
// Returns NOERROR on success, E_POINTER if ppszA is NULL, E_OUTOFMEMORY if the
// allocation fails, or the HRESULT form of the Win32 error reported by
// WideCharToMultiByte.
HRESULT __fastcall UnicodeToAnsi(LPCOLESTR pszW, LPSTR* ppszA)
{
    if (ppszA == NULL)
        return E_POINTER;

    *ppszA = NULL;

    // If input is null then just return the same.
    if (pszW == NULL)
        return NOERROR;

    // Ask the converter for the exact byte count (terminator included, because the
    // source length is given as -1). Guessing "2 bytes per character" is too small
    // for code pages that need more, e.g. when the active code page is UTF-8.
    const int cbAnsi = WideCharToMultiByte(CP_ACP, 0, pszW, -1, NULL, 0, NULL, NULL);
    if (cbAnsi == 0)
        return HRESULT_FROM_WIN32(GetLastError());

    // Use of the OLE allocator is not required because the resultant
    // ANSI string will never be passed to another COM component. You
    // can use your own allocator.
    LPSTR pszA = static_cast<LPSTR>(CoTaskMemAlloc(cbAnsi));
    if (pszA == NULL)
        return E_OUTOFMEMORY;

    // Convert to ANSI.
    if (0 == WideCharToMultiByte(CP_ACP, 0, pszW, -1, pszA, cbAnsi, NULL, NULL))
    {
        // Capture the error before CoTaskMemFree can overwrite it.
        const DWORD dwError = GetLastError();
        CoTaskMemFree(pszA);
        return HRESULT_FROM_WIN32(dwError);
    }

    *ppszA = pszA;
    return NOERROR;
}

// Converts a NUL-terminated ANSI (CP_ACP) string to a newly allocated wide string.
//
// pszA   - source string; when NULL, *ppszW is set to NULL and NOERROR is returned.
// ppszW  - receives the converted string, allocated with CoTaskMemAlloc; it is set
//          to NULL when the allocation or the conversion fails.
//
// Returns NOERROR on success, E_OUTOFMEMORY if the allocation fails, or the HRESULT
// form of the Win32 error reported by MultiByteToWideChar.
HRESULT __fastcall AnsiToUnicode(LPCSTR pszA, LPWSTR* ppszW)
{
    // A null input maps to a null output and is not an error.
    if (pszA == NULL)
    {
        *ppszW = NULL;
        return NOERROR;
    }

    // One wide character per input byte, plus the terminator, is always enough.
    const ULONG cchWide = strlen(pszA) + 1;

    // The OLE allocator is used so the result may be handed to (and freed by)
    // another COM component.
    LPWSTR pszWide = (LPOLESTR) CoTaskMemAlloc(cchWide * 2);
    *ppszW = pszWide;
    if (pszWide == NULL)
        return E_OUTOFMEMORY;

    // Convert to Unicode; a non-zero result means success.
    if (MultiByteToWideChar(CP_ACP, 0, pszA, cchWide, pszWide, cchWide) != 0)
        return NOERROR;

    // Capture the error first, then undo the allocation.
    const DWORD dwError = GetLastError();
    CoTaskMemFree(pszWide);
    *ppszW = NULL;
    return HRESULT_FROM_WIN32(dwError);
}

// Prints the HRESULT, its system message, and the source/description carried by a
// _com_error to stdout.
//
// e - the COM error to report.
void dump_com_error(_com_error &e)
{
    // ErrorMessage() returns a TCHAR string, which is wide in Unicode builds and so
    // cannot be passed to a narrow "%s" directly. Routing it through _bstr_t yields
    // narrow text in both ANSI and Unicode builds.
    _bstr_t bstrMessage(e.ErrorMessage());
    _bstr_t bstrSource(e.Source());
    _bstr_t bstrDescription(e.Description());

    // An empty _bstr_t converts to a NULL pointer; never hand NULL to "%s".
    LPCSTR pszMessage = static_cast<LPCSTR>(bstrMessage);
    LPCSTR pszSource = static_cast<LPCSTR>(bstrSource);
    LPCSTR pszDescription = static_cast<LPCSTR>(bstrDescription);

    printf("Error\n");
    printf("\a\tCode = %08lx\n", e.Error());
    // The newline keeps the "Source" line from running on from the message text.
    printf("\a\tCode meaning = %s\n", (pszMessage != NULL) ? pszMessage : "(null)");
    printf("\a\tSource = %s\n", (pszSource != NULL) ? pszSource : "(null)");
    printf("\a\tDescription = %s\n", (pszDescription != NULL) ? pszDescription : "(null)");
}

int main()
{
HRESULT hr = CoInitializeEx(NULL, COINIT_APARTMENTTHREADED);
if (hr != S_OK)
return 0;

bool encounteredError = false;

IXMLHTTPRequestPtr pXMLHTTPReq = NULL;
MSXML2::IXMLDOMDocumentPtr pXMLDocPtr = NULL;
MSXML2::IXMLDOMNodeListPtr pItemNodeList = NULL;
MSXML2::IXMLDOMElementPtr pItemElement = NULL;

try
{
// Create an XMLHTTPRequest object to request the feed
hr = pXMLHTTPReq.CreateInstance(__uuidof(MSXML2::XMLHTTP30));
if (FAILED(hr))
throw hr;

// open the request
hr = pXMLHTTPReq->open(_bstr_t(_T("GET")), _bstr_t(DILBERT_RSS_FEED), _variant_t(VARIANT_TRUE));
//hr = pXMLHTTPReq->open(_bstr_t(_T("GET")), _bstr_t(DILBERT_RSS_FEED), _variant_t(VARIANT_FALSE));
if (FAILED(hr))
throw hr;

// Set the headers
hr = pXMLHTTPReq->setRequestHeader(_bstr_t(_T("charset")), _bstr_t(_T("UTF-8")));
if (FAILED(hr))
throw hr;

// Send the request
hr = pXMLHTTPReq->send(NULL);
if (FAILED(hr))
throw hr;

long readyState = READYSTATE_UNINITIALIZED;
MSG msg;
while (readyState != READYSTATE_COMPLETE)
{
// Without this message pump, readyState does not change.
if (PeekMessage(&msg, 0, 0 ,0, PM_REMOVE))
{
TranslateMessage(&msg);
DispatchMessage(&msg);
}

readyState = pXMLHTTPReq->GetreadyState();
}

long nStatus = 0;
hr = pXMLHTTPReq->get_status(&nStatus);
if (FAILED(hr))
throw hr;

// Process the feed if the response was received successfully
if (nStatus == 200)
{
// Retrieve the RSS XML DOM Document to process the RSS
// Feed results and extract the comic strip's images info

// Retrieve the XML DOM Document from the response
BSTR bstrString = NULL;
hr = pXMLHTTPReq->get_responseText(&bstrString);
printf("Response Body:\r\n%S\r\n", bstrString);

hr = pXMLHTTPReq->get_responseXML((IDispatch **) &pXMLDocPtr);
if (FAILED(hr))
throw hr;

BSTR bstrXMLDoc = NULL;
pXMLDocPtr->get_text(&bstrXMLDoc);
printf("XML Response:\r\n%S\r\n", bstrXMLDoc);

// Retrieve the list of "item" elements
pItemNodeList = pXMLDocPtr->getElementsByTagName(_bstr_t(_T("item")));
if (FAILED(pItemNodeList))
throw hr;

//Here, if we're in error pDomNode is NULL
if (pItemNodeList != NULL)
{
long nItems = 0;
hr = pItemNodeList->get_length(&nItems);
if (FAILED(hr))
throw hr;

for (int i = 0; (i < (int)nItems) && (encounteredError == false); i++)
{
WCHAR rssTitle[TITLE_SIZE];
WCHAR rssLink[URL_SIZE];
WCHAR rssComicURL[URL_SIZE];
WCHAR rssComicFileName[MAX_PATH+1];

// Retrieve the ith item element
pItemElement = pItemNodeList->item;
if (pItemElement != NULL)
{
// Retrieve the title text
_tcscpy_s(rssTitle, pItemElement->firstChild->text);

// Retrieve the link element
MSXML2::IXMLDOMNodeListPtr pLinkNodes = NULL;
MSXML2::IXMLDOMElementPtr pLinkElement = NULL;

pLinkNodes = pItemElement->getElementsByTagName(_T("link"));
if (pLinkNodes != NULL)
{
long nLinkElements = 0;
hr = pLinkNodes->get_length(&nLinkElements);
if (FAILED(hr))
throw hr;

if (nLinkElements == 1)
{
pLinkElement = pLinkNodes->item[0];

// Retrieve the link
if (pLinkElement != NULL)
_tcscpy_s(rssLink, pLinkElement->text);
}
}

// Retrieve the description element
MSXML2::IXMLDOMNodeListPtr pSummaryNodes = NULL;
MSXML2::IXMLDOMElementPtr pSummaryElement = NULL;

pSummaryNodes = pItemElement->getElementsByTagName(_bstr_t(_T("description")));
if (pSummaryNodes != NULL)
{
long nSummaryElements = 0;
hr = pSummaryNodes->get_length(&nSummaryElements);
if (FAILED(hr))
throw hr;

if (nSummaryElements == 1)
{
pSummaryElement = pSummaryNodes->item[0];

// Retrieve the description
if (pSummaryElement != NULL)
{
LPSTR szRssSummary;
CHAR szRssComicURL[MAX_PATH+1];
UnicodeToAnsi(pSummaryElement->text, &szRssSummary);

// Retrieve the image URL
const regex imageurl("\\b(https?|ftp)://([-a-zA-Z0-9.]+)(/[-a-zA-Z0-9+&@#/%=~_|!:,.;]*)?(gif|png|jpg)");
cmatch matches;

if (regex_search(szRssSummary, matches, imageurl))
{
strcpy_s(szRssComicURL, matches[0].str().c_str());
#ifdef _UNICODE
LPWSTR pszRssComicURL;
AnsiToUnicode(szRssComicURL, &pszRssComicURL);
_tcscpy_s(rssComicURL, URL_SIZE, pszRssComicURL);
#else
_tcscpy_s(rssComicURL, URL_SIZE, matches[0].str().c_str());
#endif
}
else
{
_tcscpy_s(rssComicURL, URL_SIZE, _T("Not found"));
}

//const regex imageFileName("\\b(https?|ftp)://([-a-zA-Z0-9.]+)(/[-a-zA-Z0-9+&@#/%=~_|!:,.;]*)?(gif|png|jpg)");
const regex imageFileName("[\\w_.-]*?(?=\\?)|[\\w_.-]*$");
if (regex_search(szRssComicURL, matches, imageFileName))
{
#ifdef _UNICODE
LPWSTR pszImageFileName;
AnsiToUnicode(matches[0].str().c_str(), &pszImageFileName);
_tcscpy_s(rssComicFileName, MAX_PATH+1, pszImageFileName);
#else
_tcscpy_s(rssComicFileName, MAX_PATH+1, matches[0].str().c_str());
#endif
}
else
{
_tcscpy_s(rssComicFileName, MAX_PATH+1, _T("Not found"));
}
}
}
}
}
else
{
encounteredError = true;
}
}
encounteredError = false;
}
else
{
encounteredError = true;
}
}
else
{
encounteredError = true;
//cout << "Error selecting XML single node" ;
}
}
catch(_com_error &e)
{
bool encounteredError = true;
dump_com_error(e);
}

return 0;
}

If I run it for Dilbert Daily Strip, it retrieves and parses the feed without problems. For Stick Figure Hamlet, however, calling get_responseXML and then pXMLDocPtr->get_text(&bstrXMLDoc) results in an empty string, although the call to pXMLHTTPReq->get_responseText(&bstrString) does retrieve the response (the XML document).

I cannot spot the problem with my code. Any ideas?

Continue reading...

get_responseXML returns an empty object

fedroponce

Guest

Similar threads