This is how I'm getting all the links from a URL into a List<string>:
[code]
private List<string> getLinks(HtmlAgilityPack.HtmlDocument document)
{
    List<string> mainLinks = new List<string>();
    var linkNodes = document.DocumentNode.SelectNodes("//a[@href]");
    if (linkNodes != null)
    {
        foreach (HtmlNode link in linkNodes)
        {
            var href = link.Attributes["href"].Value;
            // filter for http/https (or www-prefixed) links only
            if (href.StartsWith("http://") || href.StartsWith("https://") || href.StartsWith("www."))
            {
                mainLinks.Add(href);
            }
        }
    }
    return mainLinks;
}
[/code]
Then I'm loading the URL into a document and collecting the links I got:
[code]
private List<string> test(string url, int levels, DoWorkEventArgs eve)
{
    HtmlWeb hw = new HtmlWeb();
    List<string> webSites;
    try
    {
        doc = hw.Load(url); // doc is a class-level HtmlDocument field
        webSites = getLinks(doc);
        retrieveImages();
        // ... (rest of the method snipped)
[/code]
So now webSites, which is a List<string>, contains the links.
Then I have the function retrieveImages(), which is where I want to get all the images from the link I'm on at that moment; for example, say the variable url currently holds www.google.com.
So I want to get a List<string> of all the image names from the URL, and later on I would also like to download the images to my hard disk.
[code]
private List<string> retrieveImages()
{
    List<string> imgList = new List<string>();
    HtmlAgilityPack.HtmlDocument doc = new HtmlAgilityPack.HtmlDocument();
    doc.Load(url); // or whatever HTML file you have -- this is the line that is not working (see below)
    HtmlNodeCollection imgs = doc.DocumentNode.SelectNodes("//img[@src]");
    if (imgs == null)
        return imgList;
    foreach (HtmlNode img in imgs)
    {
        if (img.Attributes["src"] == null)
            continue;
        HtmlAttribute src = img.Attributes["src"];
        imgList.Add(src.Value);
        // Do something with src.Value, such as getting the image and saving it locally:
        // Image img = GetImage(src.Value);
        // img.Save(aLocalFilePath);
    }
    return imgList;
}

private Image GetImage(string url)
{
    System.Net.WebRequest request = System.Net.WebRequest.Create(url);
    System.Net.WebResponse response = request.GetResponse();
    System.IO.Stream responseStream = response.GetResponseStream();
    Bitmap bmp = new Bitmap(responseStream);
    responseStream.Dispose();
    return bmp;
}
[/code]
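For the saving-to-disk part I will do later, this is roughly how I plan to call GetImage (just a sketch; the folder path is only an example, and it assumes the src values are absolute URLs):
[code]
// Rough usage sketch: download every image that retrieveImages() found.
// Assumes each src value is an absolute URL (relative ones would need to be
// resolved against the page URL first) and an example folder that already exists.
List<string> images = retrieveImages();
int i = 0;
foreach (string src in images)
{
    using (Image img = GetImage(src))
    {
        // example path only
        img.Save(@"C:\images\img" + i + ".png", System.Drawing.Imaging.ImageFormat.Png);
        i++;
    }
}
[/code]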
But what I'm not sure about is how to use the function retrieveImages().
Do I need to download each URL's source content first and then pass that file to doc.Load()? If so, how do I download the URL's page source to my hard disk so I can load it?
Or is there another way to do it directly from the URL, without downloading the source to my hard disk first?
For sure this line: doc.Load(url); is not working right now; it is not the right way to do it. Is there any way to do it without downloading the URL's source content to my hard disk and saving it as .htm/.html, working directly from the URL somehow?
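One idea that might work, based on the HtmlWeb call I already use in test(): HtmlWeb.Load(url) downloads and parses the page in memory, so nothing has to be saved to my hard disk at all. A minimal sketch (passing url into retrieveImages is my change here, not the original signature):
[code]
// Sketch: fetch the page straight from the web with HtmlWeb instead of
// HtmlDocument.Load(), which expects a local file or stream.
private List<string> retrieveImages(string url)
{
    HtmlWeb hw = new HtmlWeb();
    HtmlAgilityPack.HtmlDocument doc = hw.Load(url); // no file on disk involved

    List<string> imgList = new List<string>();
    HtmlNodeCollection imgs = doc.DocumentNode.SelectNodes("//img[@src]");
    if (imgs == null)
        return imgList;

    foreach (HtmlNode img in imgs)
    {
        imgList.Add(img.Attributes["src"].Value);
    }
    return imgList;
}

// and inside test() I would then call it with the current URL:
// retrieveImages(url);
[/code]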