EDN Admin
Well-known member
I have developed a web crawler in php which work really well. It crawls nicely but i didnt think it was fast. It still did crawl about 10 urls in a few seconds. So i moved to c# because i thought that compiled languages are faster than interpreted ones.
It does the exact same as the php one and crawl then stored in mysql database but crawles only 1 url in a few minutes?!?!?
It uses regex and mysql so i dont know if that is slowing it down, and since i am only a beginner in c# I dont know any quick short cuts or faster methods like in php.
heres my code
<div style="color:Black;background-color:White; <pre>
<span style="color:Blue; using System;
<span style="color:Blue; using System.Collections.Generic;
<span style="color:Blue; using System.ComponentModel;
<span style="color:Blue; using System.Data;
<span style="color:Blue; using System.Linq;
<span style="color:Blue; using System.Text;
<span style="color:Blue; using System.Net;
<span style="color:Blue; using System.IO;
<span style="color:Blue; using System.Threading;
<span style="color:Blue; using System.Timers;
<span style="color:Blue; using System.Text.RegularExpressions;
<span style="color:Blue; using MySql.Data.MySqlClient;
<span style="color:Blue; namespace MLBot
{
<span style="color:Blue; class Program
{
<span style="color:Blue; private <span style="color:Blue; static <span style="color:Blue; string cururl;
<span style="color:Blue; private <span style="color:Blue; static <span style="color:Blue; int urlcount = 0;
<span style="color:Blue; private <span style="color:Blue; static <span style="color:Blue; void insert_url(<span style="color:Blue; string url)
{
<span style="color:Blue; string constring = <span style="color:#A31515; "Server=localhost;Port=3306;Database=crawler;Uid=root;Password=;";
MySqlConnection con = <span style="color:Blue; new MySqlConnection(constring);
con.Open();
MySqlCommand insert = con.CreateCommand();
insert.CommandText = <span style="color:#A31515; "INSERT INTO `urls` VALUES (, " + url + <span style="color:#A31515; ", 0)";
insert.ExecuteNonQuery();
con.Close();
}
<span style="color:Blue; private <span style="color:Blue; void crawl()
{
<span style="color:Blue; string constring = <span style="color:#A31515; "Server=localhost;Port=3306;Database=crawler;Uid=root;Password=;";
MySqlConnection con = <span style="color:Blue; new MySqlConnection(constring);
con.Open();
MySqlCommand getUrls = con.CreateCommand();
getUrls.CommandText = <span style="color:#A31515; "SELECT `url` FROM `urls` WHERE `crawled`=0 LIMIT 1";
MySqlDataReader reader = getUrls.ExecuteReader();
reader.Read();
cururl = reader[<span style="color:#A31515; "url"].ToString();
reader.Close();
<span style="color:Blue; string html = get_html(cururl);
<span style="color:Blue; if(!System.String.IsNullOrEmpty(html)) {
MatchCollection links = <span style="color:Blue; null;
links = Regex.Matches(html, <span style="color:#A31515; @"(<a.*?>.*? )", RegexOptions.Singleline);
<span style="color:Blue; foreach (Match a <span style="color:Blue; in links)
{
<span style="color:Blue; string value = a.Groups[1].Value;
Match href = Regex.Match(value, <span style="color:#A31515; @"href=""(.*?)""", RegexOptions.Singleline);
<span style="color:Blue; if (href.Success)
{
<span style="color:Blue; string link = href.Groups[1].Value;
link = fix_url(cururl, link);
<span style="color:Blue; if (valid_url(link) && !in_db(link))
{
insert_url(link);
}
}
}
}
urlcount++;
Console.WriteLine(urlcount + <span style="color:#A31515; ": " + cururl);
clean(cururl);
con.Close();
crawl();
}
<span style="color:Blue; private <span style="color:Blue; static <span style="color:Blue; void clean(<span style="color:Blue; string url)
{
<span style="color:Blue; string constring = <span style="color:#A31515; "Server=localhost;Port=3306;Database=crawler;Uid=root;Password=;";
MySqlConnection con = <span style="color:Blue; new MySqlConnection(constring);
con.Open();
MySqlCommand setcrawled = con.CreateCommand();
setcrawled.CommandText = <span style="color:#A31515; "UPDATE `urls` SET `crawled`=1 WHERE `url`=" + url + <span style="color:#A31515; "";
setcrawled.ExecuteNonQuery();
con.Close();
}
<span style="color:Blue; private <span style="color:Blue; static <span style="color:Blue; bool in_db(<span style="color:Blue; string link)
{
<span style="color:Blue; string constring = <span style="color:#A31515; "Server=localhost;Port=3306;Database=crawler;Uid=root;Password=;";
MySqlConnection con = <span style="color:Blue; new MySqlConnection(constring);
con.Open();
MySqlCommand check = con.CreateCommand();
check.CommandText = <span style="color:#A31515; "SELECT COUNT(url) as `results` FROM `urls` WHERE `url`=" + link + <span style="color:#A31515; "";
MySqlDataReader reader = check.ExecuteReader();
reader.Read();
<span style="color:Blue; int rows = Convert.ToInt32(reader[<span style="color:#A31515; "results"]);
con.Close();
<span style="color:Blue; if(rows < 1) {
<span style="color:Blue; return <span style="color:Blue; false;
} <span style="color:Blue; else {
<span style="color:Blue; return <span style="color:Blue; true;
}
}
<span style="color:Blue; private <span style="color:Blue; static <span style="color:Blue; bool valid_url(<span style="color:Blue; string link)
{
<span style="color:Blue; if(<span style="color:Blue; string.IsNullOrEmpty(get_html(link))) {
<span style="color:Blue; return <span style="color:Blue; false;
} <span style="color:Blue; else {
<span style="color:Blue; return <span style="color:Blue; true;
}
}
<span style="color:Blue; private <span style="color:Blue; static <span style="color:Blue; string fix_url(<span style="color:Blue; string baseurl, <span style="color:Blue; string path)
{
<span style="color:Blue; var baseUri = <span style="color:Blue; new Uri(baseurl);
<span style="color:Blue; var absoluteUri = <span style="color:Blue; new Uri(baseUri, path);
<span style="color:Blue; string url;
<span style="color:Blue; try
{
url = absoluteUri.ToString();
}
<span style="color:Blue; catch
{
<span style="color:Blue; return baseurl;
}
<span style="color:Blue; return url;
}
<span style="color:Blue; private <span style="color:Blue; static <span style="color:Blue; string get_html(<span style="color:Blue; string url)
{
<span style="color:Blue; try
{
WebClient client = <span style="color:Blue; new WebClient();
String html = client.DownloadString(url);
<span style="color:Blue; return html;
}
<span style="color:Blue; catch
{
<span style="color:Blue; return <span style="color:Blue; null;
}
}
<span style="color:Blue; private <span style="color:Blue; static <span style="color:Blue; void cleardb()
{
<span style="color:Blue; string constring = <span style="color:#A31515; "Server=localhost;Port=3306;Database=crawler;Uid=root;Password=;";
MySqlConnection con = <span style="color:Blue; new MySqlConnection(constring);
con.Open();
MySqlCommand setnotcrawling = con.CreateCommand();
setnotcrawling.CommandText = <span style="color:#A31515; "TRUNCATE TABLE `urls`";
setnotcrawling.ExecuteNonQuery();
}
<span style="color:Blue; static <span style="color:Blue; void Main(<span style="color:Blue; string[] args)
{
<span style="color:Blue; string constring = <span style="color:#A31515; "Server=localhost;Port=3306;Database=crawler;Uid=root;Password=;";
MySqlConnection con = <span style="color:Blue; new MySqlConnection(constring);
con.Open();
Console.Write(<span style="color:#A31515; "What do you want to do? ");
<span style="color:Blue; string command = Console.ReadLine();
Console.WriteLine();
<span style="color:Blue; switch(command) {
<span style="color:Blue; case <span style="color:#A31515; "cleardb":
cleardb();
<span style="color:Blue; break;
<span style="color:Blue; default:
Console.Write(<span style="color:#A31515; "Enter a url: ");
<span style="color:Blue; string inputurl = Console.ReadLine();
<span style="color:Blue; if (!in_db(inputurl)) { insert_url(inputurl); }
Console.WriteLine();
Program crawler = <span style="color:Blue; new Program();
crawler.crawl();
<span style="color:Blue; break;
}
}
}
}
[/code]
<br/>
i know its very long, but I have no idea about shortening it.
Thanks,
Matthew
View the full article
It does the exact same as the php one and crawl then stored in mysql database but crawles only 1 url in a few minutes?!?!?
It uses regex and mysql so i dont know if that is slowing it down, and since i am only a beginner in c# I dont know any quick short cuts or faster methods like in php.
heres my code
<div style="color:Black;background-color:White; <pre>
<span style="color:Blue; using System;
<span style="color:Blue; using System.Collections.Generic;
<span style="color:Blue; using System.ComponentModel;
<span style="color:Blue; using System.Data;
<span style="color:Blue; using System.Linq;
<span style="color:Blue; using System.Text;
<span style="color:Blue; using System.Net;
<span style="color:Blue; using System.IO;
<span style="color:Blue; using System.Threading;
<span style="color:Blue; using System.Timers;
<span style="color:Blue; using System.Text.RegularExpressions;
<span style="color:Blue; using MySql.Data.MySqlClient;
<span style="color:Blue; namespace MLBot
{
<span style="color:Blue; class Program
{
<span style="color:Blue; private <span style="color:Blue; static <span style="color:Blue; string cururl;
<span style="color:Blue; private <span style="color:Blue; static <span style="color:Blue; int urlcount = 0;
<span style="color:Blue; private <span style="color:Blue; static <span style="color:Blue; void insert_url(<span style="color:Blue; string url)
{
<span style="color:Blue; string constring = <span style="color:#A31515; "Server=localhost;Port=3306;Database=crawler;Uid=root;Password=;";
MySqlConnection con = <span style="color:Blue; new MySqlConnection(constring);
con.Open();
MySqlCommand insert = con.CreateCommand();
insert.CommandText = <span style="color:#A31515; "INSERT INTO `urls` VALUES (, " + url + <span style="color:#A31515; ", 0)";
insert.ExecuteNonQuery();
con.Close();
}
<span style="color:Blue; private <span style="color:Blue; void crawl()
{
<span style="color:Blue; string constring = <span style="color:#A31515; "Server=localhost;Port=3306;Database=crawler;Uid=root;Password=;";
MySqlConnection con = <span style="color:Blue; new MySqlConnection(constring);
con.Open();
MySqlCommand getUrls = con.CreateCommand();
getUrls.CommandText = <span style="color:#A31515; "SELECT `url` FROM `urls` WHERE `crawled`=0 LIMIT 1";
MySqlDataReader reader = getUrls.ExecuteReader();
reader.Read();
cururl = reader[<span style="color:#A31515; "url"].ToString();
reader.Close();
<span style="color:Blue; string html = get_html(cururl);
<span style="color:Blue; if(!System.String.IsNullOrEmpty(html)) {
MatchCollection links = <span style="color:Blue; null;
links = Regex.Matches(html, <span style="color:#A31515; @"(<a.*?>.*? )", RegexOptions.Singleline);
<span style="color:Blue; foreach (Match a <span style="color:Blue; in links)
{
<span style="color:Blue; string value = a.Groups[1].Value;
Match href = Regex.Match(value, <span style="color:#A31515; @"href=""(.*?)""", RegexOptions.Singleline);
<span style="color:Blue; if (href.Success)
{
<span style="color:Blue; string link = href.Groups[1].Value;
link = fix_url(cururl, link);
<span style="color:Blue; if (valid_url(link) && !in_db(link))
{
insert_url(link);
}
}
}
}
urlcount++;
Console.WriteLine(urlcount + <span style="color:#A31515; ": " + cururl);
clean(cururl);
con.Close();
crawl();
}
<span style="color:Blue; private <span style="color:Blue; static <span style="color:Blue; void clean(<span style="color:Blue; string url)
{
<span style="color:Blue; string constring = <span style="color:#A31515; "Server=localhost;Port=3306;Database=crawler;Uid=root;Password=;";
MySqlConnection con = <span style="color:Blue; new MySqlConnection(constring);
con.Open();
MySqlCommand setcrawled = con.CreateCommand();
setcrawled.CommandText = <span style="color:#A31515; "UPDATE `urls` SET `crawled`=1 WHERE `url`=" + url + <span style="color:#A31515; "";
setcrawled.ExecuteNonQuery();
con.Close();
}
<span style="color:Blue; private <span style="color:Blue; static <span style="color:Blue; bool in_db(<span style="color:Blue; string link)
{
<span style="color:Blue; string constring = <span style="color:#A31515; "Server=localhost;Port=3306;Database=crawler;Uid=root;Password=;";
MySqlConnection con = <span style="color:Blue; new MySqlConnection(constring);
con.Open();
MySqlCommand check = con.CreateCommand();
check.CommandText = <span style="color:#A31515; "SELECT COUNT(url) as `results` FROM `urls` WHERE `url`=" + link + <span style="color:#A31515; "";
MySqlDataReader reader = check.ExecuteReader();
reader.Read();
<span style="color:Blue; int rows = Convert.ToInt32(reader[<span style="color:#A31515; "results"]);
con.Close();
<span style="color:Blue; if(rows < 1) {
<span style="color:Blue; return <span style="color:Blue; false;
} <span style="color:Blue; else {
<span style="color:Blue; return <span style="color:Blue; true;
}
}
<span style="color:Blue; private <span style="color:Blue; static <span style="color:Blue; bool valid_url(<span style="color:Blue; string link)
{
<span style="color:Blue; if(<span style="color:Blue; string.IsNullOrEmpty(get_html(link))) {
<span style="color:Blue; return <span style="color:Blue; false;
} <span style="color:Blue; else {
<span style="color:Blue; return <span style="color:Blue; true;
}
}
<span style="color:Blue; private <span style="color:Blue; static <span style="color:Blue; string fix_url(<span style="color:Blue; string baseurl, <span style="color:Blue; string path)
{
<span style="color:Blue; var baseUri = <span style="color:Blue; new Uri(baseurl);
<span style="color:Blue; var absoluteUri = <span style="color:Blue; new Uri(baseUri, path);
<span style="color:Blue; string url;
<span style="color:Blue; try
{
url = absoluteUri.ToString();
}
<span style="color:Blue; catch
{
<span style="color:Blue; return baseurl;
}
<span style="color:Blue; return url;
}
<span style="color:Blue; private <span style="color:Blue; static <span style="color:Blue; string get_html(<span style="color:Blue; string url)
{
<span style="color:Blue; try
{
WebClient client = <span style="color:Blue; new WebClient();
String html = client.DownloadString(url);
<span style="color:Blue; return html;
}
<span style="color:Blue; catch
{
<span style="color:Blue; return <span style="color:Blue; null;
}
}
<span style="color:Blue; private <span style="color:Blue; static <span style="color:Blue; void cleardb()
{
<span style="color:Blue; string constring = <span style="color:#A31515; "Server=localhost;Port=3306;Database=crawler;Uid=root;Password=;";
MySqlConnection con = <span style="color:Blue; new MySqlConnection(constring);
con.Open();
MySqlCommand setnotcrawling = con.CreateCommand();
setnotcrawling.CommandText = <span style="color:#A31515; "TRUNCATE TABLE `urls`";
setnotcrawling.ExecuteNonQuery();
}
<span style="color:Blue; static <span style="color:Blue; void Main(<span style="color:Blue; string[] args)
{
<span style="color:Blue; string constring = <span style="color:#A31515; "Server=localhost;Port=3306;Database=crawler;Uid=root;Password=;";
MySqlConnection con = <span style="color:Blue; new MySqlConnection(constring);
con.Open();
Console.Write(<span style="color:#A31515; "What do you want to do? ");
<span style="color:Blue; string command = Console.ReadLine();
Console.WriteLine();
<span style="color:Blue; switch(command) {
<span style="color:Blue; case <span style="color:#A31515; "cleardb":
cleardb();
<span style="color:Blue; break;
<span style="color:Blue; default:
Console.Write(<span style="color:#A31515; "Enter a url: ");
<span style="color:Blue; string inputurl = Console.ReadLine();
<span style="color:Blue; if (!in_db(inputurl)) { insert_url(inputurl); }
Console.WriteLine();
Program crawler = <span style="color:Blue; new Program();
crawler.crawl();
<span style="color:Blue; break;
}
}
}
}
[/code]
<br/>
i know its very long, but I have no idea about shortening it.
Thanks,
Matthew
View the full article