c# code running slower than php

EDN Admin

Well-known member
Joined
Aug 7, 2010
Messages
12,794
Location
In the Machine
I have developed a web crawler in php which work really well. It crawls nicely but i didnt think it was fast. It still did crawl about 10 urls in a few seconds. So i moved to c# because i thought that compiled languages are faster than interpreted ones.
It does the exact same as the php one and crawl then stored in mysql database but crawles only 1 url in a few minutes?!?!?
It uses regex and mysql so i dont know if that is slowing it down, and since i am only a beginner in c# I dont know any quick short cuts or faster methods like in php.
heres my code

<div style="color:Black;background-color:White; <pre>
<span style="color:Blue; using System;
<span style="color:Blue; using System.Collections.Generic;
<span style="color:Blue; using System.ComponentModel;
<span style="color:Blue; using System.Data;
<span style="color:Blue; using System.Linq;
<span style="color:Blue; using System.Text;
<span style="color:Blue; using System.Net;
<span style="color:Blue; using System.IO;
<span style="color:Blue; using System.Threading;
<span style="color:Blue; using System.Timers;
<span style="color:Blue; using System.Text.RegularExpressions;
<span style="color:Blue; using MySql.Data.MySqlClient;

<span style="color:Blue; namespace MLBot
{
<span style="color:Blue; class Program
{
<span style="color:Blue; private <span style="color:Blue; static <span style="color:Blue; string cururl;
<span style="color:Blue; private <span style="color:Blue; static <span style="color:Blue; int urlcount = 0;

<span style="color:Blue; private <span style="color:Blue; static <span style="color:Blue; void insert_url(<span style="color:Blue; string url)
{
<span style="color:Blue; string constring = <span style="color:#A31515; "Server=localhost;Port=3306;Database=crawler;Uid=root;Password=;";
MySqlConnection con = <span style="color:Blue; new MySqlConnection(constring);
con.Open();

MySqlCommand insert = con.CreateCommand();
insert.CommandText = <span style="color:#A31515; "INSERT INTO `urls` VALUES (, " + url + <span style="color:#A31515; ", 0)";
insert.ExecuteNonQuery();

con.Close();
}

<span style="color:Blue; private <span style="color:Blue; void crawl()
{
<span style="color:Blue; string constring = <span style="color:#A31515; "Server=localhost;Port=3306;Database=crawler;Uid=root;Password=;";
MySqlConnection con = <span style="color:Blue; new MySqlConnection(constring);
con.Open();

MySqlCommand getUrls = con.CreateCommand();
getUrls.CommandText = <span style="color:#A31515; "SELECT `url` FROM `urls` WHERE `crawled`=0 LIMIT 1";
MySqlDataReader reader = getUrls.ExecuteReader();
reader.Read();
cururl = reader[<span style="color:#A31515; "url"].ToString();

reader.Close();

<span style="color:Blue; string html = get_html(cururl);

<span style="color:Blue; if(!System.String.IsNullOrEmpty(html)) {
MatchCollection links = <span style="color:Blue; null;
links = Regex.Matches(html, <span style="color:#A31515; @"(<a.*?>.*? )", RegexOptions.Singleline);
<span style="color:Blue; foreach (Match a <span style="color:Blue; in links)
{
<span style="color:Blue; string value = a.Groups[1].Value;

Match href = Regex.Match(value, <span style="color:#A31515; @"href=""(.*?)""", RegexOptions.Singleline);

<span style="color:Blue; if (href.Success)
{
<span style="color:Blue; string link = href.Groups[1].Value;
link = fix_url(cururl, link);
<span style="color:Blue; if (valid_url(link) && !in_db(link))
{
insert_url(link);
}
}
}
}

urlcount++;
Console.WriteLine(urlcount + <span style="color:#A31515; ": " + cururl);

clean(cururl);
con.Close();

crawl();
}



<span style="color:Blue; private <span style="color:Blue; static <span style="color:Blue; void clean(<span style="color:Blue; string url)
{
<span style="color:Blue; string constring = <span style="color:#A31515; "Server=localhost;Port=3306;Database=crawler;Uid=root;Password=;";
MySqlConnection con = <span style="color:Blue; new MySqlConnection(constring);
con.Open();

MySqlCommand setcrawled = con.CreateCommand();
setcrawled.CommandText = <span style="color:#A31515; "UPDATE `urls` SET `crawled`=1 WHERE `url`=" + url + <span style="color:#A31515; "";
setcrawled.ExecuteNonQuery();

con.Close();
}

<span style="color:Blue; private <span style="color:Blue; static <span style="color:Blue; bool in_db(<span style="color:Blue; string link)
{
<span style="color:Blue; string constring = <span style="color:#A31515; "Server=localhost;Port=3306;Database=crawler;Uid=root;Password=;";
MySqlConnection con = <span style="color:Blue; new MySqlConnection(constring);
con.Open();

MySqlCommand check = con.CreateCommand();
check.CommandText = <span style="color:#A31515; "SELECT COUNT(url) as `results` FROM `urls` WHERE `url`=" + link + <span style="color:#A31515; "";
MySqlDataReader reader = check.ExecuteReader();
reader.Read();
<span style="color:Blue; int rows = Convert.ToInt32(reader[<span style="color:#A31515; "results"]);

con.Close();

<span style="color:Blue; if(rows < 1) {
<span style="color:Blue; return <span style="color:Blue; false;
} <span style="color:Blue; else {
<span style="color:Blue; return <span style="color:Blue; true;
}
}

<span style="color:Blue; private <span style="color:Blue; static <span style="color:Blue; bool valid_url(<span style="color:Blue; string link)
{
<span style="color:Blue; if(<span style="color:Blue; string.IsNullOrEmpty(get_html(link))) {
<span style="color:Blue; return <span style="color:Blue; false;
} <span style="color:Blue; else {
<span style="color:Blue; return <span style="color:Blue; true;
}
}

<span style="color:Blue; private <span style="color:Blue; static <span style="color:Blue; string fix_url(<span style="color:Blue; string baseurl, <span style="color:Blue; string path)
{
<span style="color:Blue; var baseUri = <span style="color:Blue; new Uri(baseurl);
<span style="color:Blue; var absoluteUri = <span style="color:Blue; new Uri(baseUri, path);
<span style="color:Blue; string url;

<span style="color:Blue; try
{
url = absoluteUri.ToString();
}
<span style="color:Blue; catch
{
<span style="color:Blue; return baseurl;
}

<span style="color:Blue; return url;
}

<span style="color:Blue; private <span style="color:Blue; static <span style="color:Blue; string get_html(<span style="color:Blue; string url)
{
<span style="color:Blue; try
{
WebClient client = <span style="color:Blue; new WebClient();
String html = client.DownloadString(url);
<span style="color:Blue; return html;
}
<span style="color:Blue; catch
{
<span style="color:Blue; return <span style="color:Blue; null;
}
}

<span style="color:Blue; private <span style="color:Blue; static <span style="color:Blue; void cleardb()
{
<span style="color:Blue; string constring = <span style="color:#A31515; "Server=localhost;Port=3306;Database=crawler;Uid=root;Password=;";
MySqlConnection con = <span style="color:Blue; new MySqlConnection(constring);
con.Open();

MySqlCommand setnotcrawling = con.CreateCommand();
setnotcrawling.CommandText = <span style="color:#A31515; "TRUNCATE TABLE `urls`";
setnotcrawling.ExecuteNonQuery();
}

<span style="color:Blue; static <span style="color:Blue; void Main(<span style="color:Blue; string[] args)
{
<span style="color:Blue; string constring = <span style="color:#A31515; "Server=localhost;Port=3306;Database=crawler;Uid=root;Password=;";
MySqlConnection con = <span style="color:Blue; new MySqlConnection(constring);
con.Open();
Console.Write(<span style="color:#A31515; "What do you want to do? ");
<span style="color:Blue; string command = Console.ReadLine();
Console.WriteLine();
<span style="color:Blue; switch(command) {
<span style="color:Blue; case <span style="color:#A31515; "cleardb":
cleardb();
<span style="color:Blue; break;

<span style="color:Blue; default:
Console.Write(<span style="color:#A31515; "Enter a url: ");
<span style="color:Blue; string inputurl = Console.ReadLine();
<span style="color:Blue; if (!in_db(inputurl)) { insert_url(inputurl); }
Console.WriteLine();
Program crawler = <span style="color:Blue; new Program();
crawler.crawl();
<span style="color:Blue; break;
}
}
}
}

[/code]

<br/>
i know its very long, but I have no idea about shortening it.
Thanks,
Matthew

View the full article
 
Back
Top