Im trying to parse all links from a website but im getting exception what could be wrong?

EDN Admin

Well-known member
Joined
Aug 7, 2010
Messages
12,794
Location
In the Machine
This is the class im using:

<pre class="prettyprint using System;
using System.Collections.Generic;
using System.Text;
using System.Net;
using System.IO;
using System.Text.RegularExpressions;

namespace FindAllUrls
{
class GetUrls
{

//public method called from your application
public void RetrieveUrls(string webPage)
{
GetAllUrls(RetrieveContent(webPage));
}

//get the content of the web page passed in
private string RetrieveContent(string webPage)
{
HttpWebResponse response = null;//used to get response
StreamReader respStream = null;//used to read response into string
try
{
//create a request object using the url passed in
HttpWebRequest request = (HttpWebRequest)WebRequest.Create(webPage);
request.Timeout = 10000;

//go get a response from the page
response = (HttpWebResponse)request.GetResponse();

//create a streamreader object from the response
respStream = new StreamReader(response.GetResponseStream());

//get the contents of the page as a string and return it
return respStream.ReadToEnd();
}
catch (Exception ex)//houston we have a problem!
{
throw ex;
}
finally
{
//close it down, were going home!
response.Close();
respStream.Close();
}
}

//using a regular expression, find all of the href or urls
//in the content of the page
private void GetAllUrls(string content)
{
//regular expression
string pattern = @"(?:hrefs*=)(?:[s""]*)(?!#|mailto|location.|javascript|.*css|.*this.)(?
.*?)(?:[s>""])";

//Set up regex object
Regex RegExpr = new Regex(pattern, RegexOptions.IgnoreCase);

//get the first match
Match match = RegExpr.Match(content);

//loop through matches
while (match.Success)
{

//output the match info
Console.WriteLine("href match: " + match.Groups[0].Value);
WriteToLog("C:matchlog.txt", "href match: " + match.Groups[0].Value + "rn");

Console.WriteLine("Url match: " + match.Groups[1].Value);
WriteToLog("C:matchlog.txt", "Url | Location | mailto match: " + match.Groups[1].Value + "rn");

//get next match
match = match.NextMatch();
}
}

//Write to a log file
private void WriteToLog(string file, string message)
{
using (StreamWriter w = File.AppendText(file))
{
w.WriteLine(DateTime.Now.ToString() + ": " + message); w.Close();
}
}
}
}[/code]
<br/>
The exception is on the line:
<pre class="lang-cs prettyprint
Code:
<span class="typ Regex<span class="pln  <span class="typ RegExpr<span class="pln  <span class="pun =<span class="pln  <span class="kwd new<span class="pln  <span class="typ Regex<span class="pun (<span class="pln pattern<span class="pun ,<span class="pln  <span class="typ RegexOptions<span class="pun .<span class="typ IgnoreCase<span class="pun );<span class="pln <br/>
[/code]
The exception error is:
parsing "(?:hrefs*=)(?:[s"]*)(?!#|mailto|location.|javascript|.*css|.this.)(? .?)(?:[s>"])" - Unrecognized grouping construct.
What is wrong ? And how can i fix it ?
<hr class="sig danieli

View the full article
 
Back
Top