Im trying to retrive web links address and to check for timeout and also if its exe in the link but

EDN Admin · Oct 2, 2012

<pre class="prettyprint using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using HtmlAgilityPack;
using System.IO;
using System.Text.RegularExpressions;
using System.Xml.Linq;
using System.Net;
using System.Web;
using System.Threading;
using DannyGeneral;

namespace GatherLinks
{
class TimeOut
{
private Form1 frm1reference;

public TimeOut(Form1 frm1)
{
frm1reference = frm1;
}

class MyClient : WebClient
{
public bool HeadOnly { get; set; }
protected override WebRequest GetWebRequest(Uri address)
{
WebRequest req = base.GetWebRequest(address);
if (HeadOnly && req.Method == "GET")
{
req.Method = "HEAD";
}
return req;
}
}

//****\

private static HtmlAgilityPack.HtmlDocument getHtmlDocumentWebClient(string url, bool useProxy, string proxyIp, int proxyPort, string usename, string password)
{
//**************************************\
using (MyClient clients = new MyClient())
{
clients.HeadOnly = true;
//string uri = "http://www.google.com";
byte[] body = clients.DownloadData(url);
// note should be 0-length
string type = clients.ResponseHeaders["content-type"];
clients.HeadOnly = false;
// check tis not binary... well use text/, but could
// check for text/html
if (type.StartsWith(@"text/"))
{
string text = clients.DownloadString(url);
MessageBox.Show("Test: " + text);//Console.WriteLine(text);
}
}

HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
WebClient client = new WebClient();
//client.Headers.Add("user-agent", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; .NET CLR 1.0.3705

");
client.Credentials = CredentialCache.DefaultCredentials;
client.Proxy = WebRequest.DefaultWebProxy;
if (useProxy)
{
//Proxy
if (!string.IsNullOrEmpty(proxyIp))
{
WebProxy p = new WebProxy(proxyIp, proxyPort);
if (!string.IsNullOrEmpty(usename))
{
if (password == null)
password = string.Empty;
NetworkCredential nc = new NetworkCredential(usename, password);
p.Credentials = nc;
}
}
}

//Stream data = myHttpWebResponse.GetResponseStream();//client.OpenRead(url);
//client.OpenRead(url);
doc.Load(client.OpenRead(url)); // to check here when trying to Load link that end with exe !!!!!! like: http://appldnld.apple.com/iTunes10/041-7196.20120912.Ber43/iTunesSetup.exe
return doc; }

public HtmlAgilityPack.HtmlDocument GetHtmlDoc(string url, Options options, ProxyModel proxy)
{
try
{
//Wait before execute
if (url.EndsWith("exe"))
{
MessageBox.Show(url);
}
if (options != null)
{
if (options.Request_pause_milliseconds >= 0)
System.Threading.Thread.Sleep(options.Request_pause_milliseconds);
}
//Execute WITH Proxy
if (proxy != null)
{
//if proxy ip is empty use standard request
if (proxy.Ip.ToLower() != "")
return getHtmlDocumentWebClient(url, true, proxy.Ip, Int32.Parse(proxy.Port), proxy.Username, proxy.Password);
else return getHtmlDocumentWebClient(url, false, string.Empty, 0, string.Empty, string.Empty);
}
else return getHtmlDocumentWebClient(url, false, string.Empty, 0, string.Empty, string.Empty);
}
catch (TimeoutException tEx)
{
System.Threading.Thread.Sleep(options.Pause_after_timeout_milliseconds);
Logger.Write(string.Format("TimeoutException : Noc.HtmlScraper.Request.Load.GetHtmlDoc({0},options,proxies)" + url + " " + tEx));
frm1reference.timeOut = true;
return null;
}
catch (Exception ex)
{
Logger.Write("TimeoutException : Noc.HtmlScraper.Request.Load.GetHtmlDoc({0},options,proxies)" + url + " " + ex);
frm1reference.timeOut = true;
return null;
}
}

public class Options
{
public int Request_pause_milliseconds { get; set; }
public int Pause_after_timeout_milliseconds { get; set; }
public Options() { }
public Options(int request_pause_milliseconds, int Pause_after_timeout_milliseconds, bool EnableProxy)
{
this.Request_pause_milliseconds = request_pause_milliseconds;
this.Pause_after_timeout_milliseconds = Pause_after_timeout_milliseconds;
}
}
public class ProxyModel
{
public string Ip { get; set; }
public string Port { get; set; }
public string Username { get; set; }
public string Password { get; set; }
public ProxyModel() { }
public ProxyModel(string Ip, string Port, string Username, string Password)

{
this.Ip = Ip; this.Username = Username; this.Password = Password; this.Port = Port;
}
}
}
}
[/code]
Im using this class in the Form1 like this:

<pre class="prettyprint private List<string> getLinks(HtmlAgilityPack.HtmlDocument document)
{

List<string> mainLinks = new List<string>();
var linkNodes = document.DocumentNode.SelectNodes("//a[@href]");
if (linkNodes != null)
{
foreach (HtmlNode link in linkNodes)
{
var href = link.Attributes["href"].Value;
if (href.StartsWith("http://") == true || href.StartsWith("https://") == true || href.StartsWith("www") == true) // filter for http
{
mainLinks.Add(href);
}
}
}

return mainLinks;

}

private List<string> test(string url, int levels,DoWorkEventArgs eve)
{

levels = levelsToCrawl;
HtmlWeb hw = new HtmlWeb();
List<string> webSites;
List<string> csFiles = new List<string>();

csFiles.Add("temp string to know that something is happening in level = " + levels.ToString());
csFiles.Add("current site name in this level is : " + url);

try
{
this.Invoke(new MethodInvoker(delegate { Texts(richTextBox1, "Loading The Url: " , Color.Red); }));
this.Invoke(new MethodInvoker(delegate { Texts(richTextBox1, url + "...",Color.Blue); }));
HtmlAgilityPack.HtmlDocument doc = to.GetHtmlDoc(url, reqOptions, null);
/* if (doc == null)
{
this.Invoke(new MethodInvoker(delegate { Texts(richTextBox1, " Check The Link" + Environment.NewLine, Color.Green); }));
return csFiles;
}*/
//string html = doc.DocumentNode.InnerHtml;
//get Text
//string pageText = doc.DocumentNode.InnerText;
//doc = hw.Load(url);
if (timeOut == true)
{
this.Invoke(new MethodInvoker(delegate { Texts(richTextBox1, " There Was A TimeOut" + Environment.NewLine , Color.Green); }));
timeOut = false;
//return csFiles;
}
else
{
this.Invoke(new MethodInvoker(delegate { Texts(richTextBox1, " Done " + Environment.NewLine, Color.Red); }));
}

currentCrawlingSite.Add(url);
webSites = getLinks(doc);
removeDupes(webSites);
removeDuplicates(webSites, currentCrawlingSite);
removeDuplicates(webSites, sitesToCrawl);
if (removeExt == true)
{
removeExternals(webSites);
}
if (downLoadImages == true)
{
webContent.retrieveImages(url);
}

if (levels > 0)
sitesToCrawl.AddRange(webSites);

this.Invoke(new MethodInvoker(delegate { label7.Text = sitesToCrawl.Count.ToString(); }));
this.Invoke(new MethodInvoker(delegate { label3.Text = currentCrawlingSite.Count().ToString(); }));

if (levels == 0)
{
return csFiles;
}
else
{

for (int i = 0; i < webSites.Count(); i++)//&& i < 20; i++) // limiting ourseleves for 20 sites for each level for now..
//or it will take forever.
{
//int mx = Math.Min(webSites.Count(), 20);

string t = webSites;
if ((t.StartsWith("http://") == true) || (t.StartsWith("https://") == true))
{

csFiles.AddRange(webCrawler(t, levels - 1, eve));
}

}
return csFiles;
}

}
catch
{
return csFiles;
}

}[/code]
The problem when using the class of the TimeOut is that im not sure when its timeout and when some other exception in the new class. I have in the new class a bool variable wich im using in Form1 too :
<pre class="prettyprint frm1reference.timeOut = true;[/code]
And in Form1 i tried to check when its timeout and when not. But its not working.
Another problem is in the new class im using this code:

<pre class="prettyprint using (MyClient clients = new MyClient())
{
clients.HeadOnly = true;
//string uri = "http://www.google.com";
byte[] body = clients.DownloadData(url);
// note should be 0-length
string type = clients.ResponseHeaders["content-type"];
clients.HeadOnly = false;
// check tis not binary... well use text/, but could
// check for text/html
if (type.StartsWith(@"text/"))
{
string text = clients.DownloadString(url);
MessageBox.Show("Test: " + text);//Console.WriteLine(text);
}
}[/code]
It should check for cases like for example when the url contain a link like this:
<pre class="prettyprint http://appldnld.apple.com/iTunes10/041-7196.20120912.Ber43/iTunesSetup.exe [/code]
The problem is that its trying to get the link in Form1 and make the program the application to stuck freeze stop working for long time i need to stop it manual with Task Manager.
What i need to do is to check somehow when the link contain exe files so not to download/get this link/s but its not working.

So the problems are:

1. The timeout section still not working.
2. The checking for urls wich contain exe inside also not working.

Thanks.

<hr class="sig danieli

View the full article

Im trying to retrive web links address and to check for timeout and also if its exe in the link but

EDN Admin

Well-known member

Similar threads