EDN Admin
Well-known member
For example the site : http://www.walla.co.il
When im looking on the site source i see the hebrew words.
But in my code which is working with sites like cnn.com foxnew.com i dont see the hebrew words at all.
This is my code that is working for sites in english.using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Windows.Forms;
namespace ScrambleRandomWordsTest
{
public partial class Form1 : Form
{
private static readonly Random RandomGen = new Random();
private static readonly Regex AnyWordRegex = new Regex(@"((?<word>[a-zA-Z]{4,}))", RegexOptions.Singleline | RegexOptions.Compiled);
private static readonly Regex StripHtmlRegex = new Regex(@"(<[^>]*>)", RegexOptions.Singleline | RegexOptions.Compiled);
private const string OriginalHtmlFilePath = @"C:Templocalfile.html";
private const string StrippedHtmlFilePath = @"C:TemphtmlWithoutTags.html";
private string ScrambledHtmlFilePath = @"C:Templocalscrambledfile.html";
private List<string> _words;
private List<string> _originalWords;
private List<string> _lengthaboveone;
private List<string> _scrambledWords;
private string webSite;
public Form1()
{
InitializeComponent();
var uri = new Uri("http://www.walla.co.il");
var result = uri.Host.Split(.)[1];
ScrambledHtmlFilePath = @"C:Temp" + result + ".html";
webSite = uri.ToString();
_scrambledWords = new List<string>();
DownloadHtml();
string file = File.ReadAllText(OriginalHtmlFilePath);
string strippedHtml = StripHtml(file);
using (var streamWriter = new StreamWriter(StrippedHtmlFilePath))
{
streamWriter.Write(strippedHtml);
streamWriter.Close();
}
_words = ExtractWords(strippedHtml);
_originalWords = _words.ToList();
GetText();
InsertContent();
string html = File.ReadAllText(ScrambledHtmlFilePath);
webBrowser1.DocumentText = html;
}
private void Form1_Load(object sender, EventArgs e)
{
}
private void DownloadHtml()
{
using (var client = new WebClient())
{
client.DownloadFile(webSite, OriginalHtmlFilePath);
}
}
public static string StripHtml(string htmlString)
{
return StripHtmlRegex.Replace(htmlString, @"|");
}
private void GetText()
{
_lengthaboveone = new List<string>();
for (int i = 0; i < _words.Count; i++)
{
string word = _words;
if (word.Length < 4) continue;
string first = word.Substring(0, 1);
string last = word.Substring(word.Length - 1, 1);
string middle = word.Substring(1, word.Length - 2);
_lengthaboveone.Add(middle);
_words = first + MakeRandomwords(middle) + last;
}
_scrambledWords = _words;
}
private static StringBuilder MakeRandomwords(string theWord)
{
var jumbleSb = new StringBuilder();
jumbleSb.Append(theWord);
int lengthSb = jumbleSb.Length;
for (int i = 0; i < lengthSb; ++i)
{
int index1 = (RandomGen.Next() % lengthSb);
int index2 = (RandomGen.Next() % lengthSb);
Char temp = jumbleSb[index1];
jumbleSb[index1] = jumbleSb[index2];
jumbleSb[index2] = temp;
}
return jumbleSb;
}
private void InsertContent()
{
string originalHtml;
using (var streamReader = new StreamReader(OriginalHtmlFilePath))
{
originalHtml = streamReader.ReadToEnd();
}
int nextIndex = 0;
using (var streamWriter = new StreamWriter(ScrambledHtmlFilePath))
{
for (int i = 0; i < _originalWords.Count; i++)
{
string word = _originalWords;
string startTag = word;
int startTagWidth = startTag.Length;
int index = originalHtml.IndexOf(word, nextIndex > 0 ? nextIndex - 1 : 0, StringComparison.Ordinal);
if (index == -1) break;
string contextHtml = originalHtml.Substring(nextIndex, index - nextIndex);
streamWriter.Write(contextHtml);
streamWriter.Write(_scrambledWords);
nextIndex = index + startTagWidth;
}
streamWriter.Close();
}
}
private static List<string> ExtractWords(string text)
{
MatchCollection matchCollection = AnyWordRegex.Matches(text);
return (from Match match in matchCollection select match.Groups[1].Value).ToList();
}
}
}
But when the site is walla.co.il
And then im looking on the List _words in the constructor with a breakpoint i see 610 words only and all of them are in english.
What could be the reason it dosent know to handle hebrew letters and how can i solve it ?
View the full article
When im looking on the site source i see the hebrew words.
But in my code which is working with sites like cnn.com foxnew.com i dont see the hebrew words at all.
This is my code that is working for sites in english.using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Windows.Forms;
namespace ScrambleRandomWordsTest
{
public partial class Form1 : Form
{
private static readonly Random RandomGen = new Random();
private static readonly Regex AnyWordRegex = new Regex(@"((?<word>[a-zA-Z]{4,}))", RegexOptions.Singleline | RegexOptions.Compiled);
private static readonly Regex StripHtmlRegex = new Regex(@"(<[^>]*>)", RegexOptions.Singleline | RegexOptions.Compiled);
private const string OriginalHtmlFilePath = @"C:Templocalfile.html";
private const string StrippedHtmlFilePath = @"C:TemphtmlWithoutTags.html";
private string ScrambledHtmlFilePath = @"C:Templocalscrambledfile.html";
private List<string> _words;
private List<string> _originalWords;
private List<string> _lengthaboveone;
private List<string> _scrambledWords;
private string webSite;
public Form1()
{
InitializeComponent();
var uri = new Uri("http://www.walla.co.il");
var result = uri.Host.Split(.)[1];
ScrambledHtmlFilePath = @"C:Temp" + result + ".html";
webSite = uri.ToString();
_scrambledWords = new List<string>();
DownloadHtml();
string file = File.ReadAllText(OriginalHtmlFilePath);
string strippedHtml = StripHtml(file);
using (var streamWriter = new StreamWriter(StrippedHtmlFilePath))
{
streamWriter.Write(strippedHtml);
streamWriter.Close();
}
_words = ExtractWords(strippedHtml);
_originalWords = _words.ToList();
GetText();
InsertContent();
string html = File.ReadAllText(ScrambledHtmlFilePath);
webBrowser1.DocumentText = html;
}
private void Form1_Load(object sender, EventArgs e)
{
}
private void DownloadHtml()
{
using (var client = new WebClient())
{
client.DownloadFile(webSite, OriginalHtmlFilePath);
}
}
public static string StripHtml(string htmlString)
{
return StripHtmlRegex.Replace(htmlString, @"|");
}
private void GetText()
{
_lengthaboveone = new List<string>();
for (int i = 0; i < _words.Count; i++)
{
string word = _words;
if (word.Length < 4) continue;
string first = word.Substring(0, 1);
string last = word.Substring(word.Length - 1, 1);
string middle = word.Substring(1, word.Length - 2);
_lengthaboveone.Add(middle);
_words = first + MakeRandomwords(middle) + last;
}
_scrambledWords = _words;
}
private static StringBuilder MakeRandomwords(string theWord)
{
var jumbleSb = new StringBuilder();
jumbleSb.Append(theWord);
int lengthSb = jumbleSb.Length;
for (int i = 0; i < lengthSb; ++i)
{
int index1 = (RandomGen.Next() % lengthSb);
int index2 = (RandomGen.Next() % lengthSb);
Char temp = jumbleSb[index1];
jumbleSb[index1] = jumbleSb[index2];
jumbleSb[index2] = temp;
}
return jumbleSb;
}
private void InsertContent()
{
string originalHtml;
using (var streamReader = new StreamReader(OriginalHtmlFilePath))
{
originalHtml = streamReader.ReadToEnd();
}
int nextIndex = 0;
using (var streamWriter = new StreamWriter(ScrambledHtmlFilePath))
{
for (int i = 0; i < _originalWords.Count; i++)
{
string word = _originalWords;
string startTag = word;
int startTagWidth = startTag.Length;
int index = originalHtml.IndexOf(word, nextIndex > 0 ? nextIndex - 1 : 0, StringComparison.Ordinal);
if (index == -1) break;
string contextHtml = originalHtml.Substring(nextIndex, index - nextIndex);
streamWriter.Write(contextHtml);
streamWriter.Write(_scrambledWords);
nextIndex = index + startTagWidth;
}
streamWriter.Close();
}
}
private static List<string> ExtractWords(string text)
{
MatchCollection matchCollection = AnyWordRegex.Matches(text);
return (from Match match in matchCollection select match.Groups[1].Value).ToList();
}
}
}
But when the site is walla.co.il
And then im looking on the List _words in the constructor with a breakpoint i see 610 words only and all of them are in english.
What could be the reason it dosent know to handle hebrew letters and how can i solve it ?
View the full article