WATS WRONG WITH MY C# WEB DATA EXTRACTION CODE?

EDN Admin · May 2, 2012

<pre class="prettyprint using System;
using System.Collections.Generic;
using System.Collections.Concurrent;
using System.Text;
using System.Text.RegularExpressions;
using System.IO;
using System.Net;
using System.Threading.Tasks;

namespace BookInfoCrawler
{
class ChinaPubCrawler : DeepCrawler
{
public ChinaPubCrawler(String keyword)
: base(keyword)
{
}

public override void update()
{
String url = "http://search.china-pub.com/s/?key1=" + _keyword + "&type=&pz=1&t=2";
String pageContent = getPageContent(url);

if (getBookCount(pageContent) > 0)
{
getPageCount(pageContent);

DateTime dt1 = DateTime.Now;
if(true)
{
// use concurrent method simply
ConcurrentStack<String> pageData = new ConcurrentStack<String>();
Parallel.For(1, _webInfo._pageCount+1, pageIndex =>
{
String newUrl = url + "&page=" + pageIndex;
pageData.Push(getPageContent(newUrl));
});

while (pageData.TryPop(out pageContent))
{
foreach (string oneBook in getAllBook(pageContent))
{
extractBookInfo(oneBook);
}
}
}
else
{
// traditional method
for (int pageIndex = 1; pageIndex <= _webInfo._pageCount; ++pageIndex)
{
String newUrl = url + "&page=" + pageIndex;
pageContent = getPageContent(newUrl);

foreach (string oneBook in getAllBook(pageContent))
{
extractBookInfo(oneBook);
}
}
}

DateTime dt2 = DateTime.Now;
_costTime = (dt2 - dt1).TotalSeconds;
}

if (_webInfo._bookCount != _bookInfos.Count)
{
_webInfo._bookCount = _bookInfos.Count;
}
}

protected override int getBookCount(String pageContent)
{
try
{
String keyword = "break";
String pattern = generateTagPattern(keyword);
Regex reg = new Regex(pattern, RegexOptions.IgnoreCase | RegexOptions.Singleline);
MatchCollection mc = reg.Matches(pageContent);

if (mc.Count == 0)
{
Console.WriteLine("No matching book!");
}
else
{
String bookStuff = mc[0].Value;
// 使用 <span>xml 搜索，共有 <span>534 种商品
String bookPattern = "<span>([0-9]+?).+";
Regex bookReg = new Regex(bookPattern, RegexOptions.IgnoreCase);

_webInfo._bookCount = Convert.ToInt32(bookReg.Match(bookStuff).Groups[1].Value);

Console.WriteLine("OK, we got " + _webInfo._bookCount + " books now!");
}
}
catch (System.Exception webEx)
{
Console.WriteLine(webEx.Message.ToString());
}

return _webInfo._bookCount;
}

protected override int getPageCount(String pageContent)
{
_webInfo._pageCount = Convert.ToInt32(Math.Ceiling(_webInfo._bookCount / 20.0));
return _webInfo._pageCount;
}

protected override List<String> getAllBook(String pageContent)
{
List<String> allBooks = new List<String>();
try
{
String majorPattern = generateTagPattern("listview");
Regex majorReg = new Regex(majorPattern, RegexOptions.IgnoreCase | RegexOptions.Singleline);
Match majorMatch = majorReg.Match(pageContent);

// <pre><a target="_blank http://product.china-pub.com/29851
String pattern = "<pre><a target="_blank (.+?)"";
Regex reg = new Regex(pattern, RegexOptions.IgnoreCase);
MatchCollection mc = reg.Matches(majorMatch.Value);

if (mc.Count > 0)
{
//foreach (Match match in mc)
//{
// allBooks.Add(match.Value);
//}

// use concurrent method simply
ConcurrentStack<String> pageData = new ConcurrentStack<String>();
Parallel.For(0, mc.Count, idx =>
{
pageData.Push(getPageContent(mc[idx].Value));
});

String tmpPageContent;
while (pageData.TryPop(out tmpPageContent))
{
allBooks.Add(tmpPageContent);
}
}
else
{
Console.WriteLine("No matching content!");
}
}
catch (System.Exception webEx)
{
Console.WriteLine(webEx.Message.ToString());
}

return allBooks;
}

protected override void extractBookInfo(String bookContent)
{
BookInfo bookInfo;
bookInfo._name = String.Empty;
bookInfo._author = String.Empty;
bookInfo._description = String.Empty;
bookInfo._price = String.Empty;
bookInfo._discount = String.Empty;
bookInfo._webSite = "www.china-pub.com";

Match _match;

#region BookName
// <h1 class="black15c" id=js_shuming>Java Web开发详解--XML+XSLT+Servlet+JSP深入剖析与实例应用 (被《程序员》等机构评选为2006年最受读者喜爱的十大IT图书之一)</h1>
String keyword = "js_shuming";
String pattern = "<(?<HtmlTag>[\w]+)[^>]*\s[iI][dD]=(?<Quote>["]?)" + keyword + "(?(Quote)\k<Quote>)["]?[^>]*>((?<Nested><\k<HtmlTag>[^>]*>)|</\k<HtmlTag>>(?<-Nested>)|.*?)*</\k<HtmlTag>>"; ;
Regex reg = new Regex(pattern, RegexOptions.IgnoreCase | RegexOptions.Singleline);
MatchCollection mc = reg.Matches(bookContent);
bookInfo._name = mc[0].Value;
#endregion

#region BookAuthor
String mainPattern = generateTagPattern("lcon more-infos");
Regex mainReg = new Regex(mainPattern, RegexOptions.IgnoreCase);
Match mainMatch = mainReg.Match(bookContent);

String authorPattern = "<a href=.+? (.+?) ";
Regex authorReg = new Regex(authorPattern, RegexOptions.IgnoreCase);

mc = authorReg.Matches(bookContent);
foreach (Match match in mc)
{
bookInfo._author += match.Groups[1].Value + " ";
}
bookInfo._author = bookInfo._author.Trim();
#endregion

#region BookDescription
String descriPattern = generateTagPattern("neirong");
Regex descriReg = new Regex(descriPattern, RegexOptions.IgnoreCase | RegexOptions.Singleline);

_match = descriReg.Match(bookContent);
bookInfo._description = _match.Groups[1].Value;
bookInfo._description = bookInfo._description.Replace("<br />", String.Empty);
bookInfo._description = bookInfo._description.Replace("n", " ");
bookInfo._description = bookInfo._description.Trim();
#endregion

#region BookPrice
mainPattern = generateTagPattern("price-area");
mainReg = new Regex(mainPattern, RegexOptions.IgnoreCase|RegexOptions.Singleline);
_match = mainReg.Match(bookContent);

String pricePattern = ".+?￥(.+?)";
Regex priceReg = new Regex(pricePattern, RegexOptions.IgnoreCase);

_match = priceReg.Match(_match.Value);
bookInfo._price = _match.Groups[1].Value;
bookInfo._price = bookInfo._description.Trim();
#endregion

#region discount
String discountPattern = ""discount (.+?)<";
Regex discountReg = new Regex(discountPattern, RegexOptions.IgnoreCase);

_match = discountReg.Match(bookContent);
bookInfo._discount = _match.Groups[1].Value;
#endregion

// make sure it is a right book what we need
if (bookInfo._name.ToUpper().Contains(_keyword.ToUpper()))
{
_bookInfos.Add(bookInfo);
}
}

}
}
[/code]
<br/>
Hi guys i am trying to make a system that extract book inofrmation from various chinese websites now i have managed to extract results from one site but this site in particular, am having challenges i dont know whats wrong with my code please help.

View the full article

WATS WRONG WITH MY C# WEB DATA EXTRACTION CODE?

EDN Admin

Well-known member

Similar threads