using System;
using System.Diagnostics;
using System.Text;
using System.Text.RegularExpressions;
using System.IO;
using mshtml;
namespace HB.Net
{
/// <summary>
/// Creates a managed wrapper for a <see cref="mshtml.HTMLDocument"/> object.
/// </summary>
/// <remarks>
/// The document is always loaded from a file on disk (<see cref="LocalPath"/>).
/// Constructors that accept in-memory content first write it to a temporary
/// <c>.html</c> file, which is deleted again when the instance is disposed.
/// </remarks>
public class HtmlDocument : IDisposable
{
    /// <summary>
    /// Upper bound on the number of temporary file names tried before giving up.
    /// </summary>
    private const int MaxTempFileAttempts = 10;

    /// <summary>
    /// true when the file at <see cref="LocalPath"/> is deleted on dispose.
    /// </summary>
    private readonly bool _deleteWhenDone;

    /// <summary>
    /// The underlying <see cref="mshtml.HTMLDocument"/>.
    /// </summary>
    public readonly HTMLDocument MsHtmlDoc;

    /// <summary>
    /// The file path of the downloaded html document.
    /// </summary>
    public readonly string LocalPath;

    private bool _disposed;

    /// <summary>
    /// The content of the webpage: the bytes of the file at <see cref="LocalPath"/>
    /// (read after any script removal) decoded as ASCII.
    /// </summary>
    public readonly string AsciiData;

    /// <summary>
    /// Matches an opening tag from the stripped-tag list through its matching closing tag.
    /// </summary>
    private static readonly Regex ScriptParser;

    static HtmlDocument()
    {
        string[] tags = new string[] {"script", /*"style", */"object", "head", "map", "iframe", "javascript"};

        // The group holds only the tag name; \b stops "head" from matching "<header",
        // and \1 requires the closing tag to carry the same name as the opening one.
        // (Previously the ')' sat after the '>', so the '>' belonged to the last
        // alternative only and the tag names matched as bare prefixes.)
        string scriptParserPattern = @"<(" + string.Join("|", tags) + @")\b.*?</\1\s*>";
        ScriptParser = new Regex(scriptParserPattern, RegexOptions.Compiled | RegexOptions.IgnoreCase | RegexOptions.Singleline);
    }

    /// <summary>
    /// Creates a <see cref="HtmlDocument"/> from the binary data of a webpage.
    /// </summary>
    /// <param name="data">The binary data of the webpage</param>
    /// <param name="removeScripting">true to remove javascript.</param>
    /// <exception cref="ArgumentNullException"><paramref name="data"/> is null.</exception>
    public HtmlDocument(byte[] data, bool removeScripting)
        : this(CreateFile(data), true, removeScripting)
    {
    }

    /// <summary>
    /// Creates a <see cref="HtmlDocument"/> from html markup, encoding it as ASCII.
    /// </summary>
    /// <param name="html">The html markup of the webpage.</param>
    /// <param name="removeScripting">true to remove javascript.</param>
    public HtmlDocument(string html, bool removeScripting)
        : this(html, removeScripting, Encoding.ASCII)
    {
    }

    /// <summary>
    /// Creates a <see cref="HtmlDocument"/> from html markup using the given encoding.
    /// </summary>
    /// <param name="html">The html markup of the webpage.</param>
    /// <param name="removeScripting">true to remove javascript.</param>
    /// <param name="encoding">The encoding used to turn <paramref name="html"/> into bytes.</param>
    public HtmlDocument(string html, bool removeScripting, Encoding encoding)
        : this(encoding.GetBytes(html), removeScripting)
    {
    }

    /// <summary>
    /// Creates a <see cref="HtmlDocument"/> from a webpage file.
    /// </summary>
    /// <param name="filename">The file path of the webpage.</param>
    /// <param name="deleteFile">true to delete the file on dispose.</param>
    /// <param name="removeScripting">set to true to remove script tags.</param>
    public HtmlDocument(string filename, bool deleteFile, bool removeScripting)
    {
        _disposed = false;
        // Honour the caller's choice; a file the caller still owns must survive this object.
        _deleteWhenDone = deleteFile;
        LocalPath = filename;
        try
        {
            if(removeScripting)
                Preparse(filename);
            MsHtmlDoc = CreateHTMLDocument(out AsciiData);
        }
        catch
        {
            // Only clean up files this instance was asked to delete.
            if(deleteFile)
                TryDeleteFile(filename);
            throw;
        }
    }

    /// <summary>
    /// Deletes the webpage (when requested at construction) and closes the
    /// underlying <see cref="mshtml.HTMLDocument"/> object.
    /// </summary>
    public void Dispose()
    {
        Dispose(true);
        GC.SuppressFinalize(this);
    }

    /// <summary>
    /// Releases the resources held by this instance.
    /// </summary>
    /// <param name="disposing">
    /// true when called from <see cref="Dispose()"/>; false when called from the
    /// finalizer, in which case the document object is not touched and only the
    /// file is cleaned up.
    /// </param>
    protected virtual void Dispose(bool disposing)
    {
        if(_disposed)
            return;
        _disposed = true;
        try
        {
            // MsHtmlDoc is null when the constructor failed before the document was created.
            if(disposing && MsHtmlDoc != null)
                MsHtmlDoc.close();
        }
        finally
        {
            // Runs even if close() throws so the file is not left behind.
            if(_deleteWhenDone)
                TryDeleteFile(LocalPath);
        }
    }

    ~HtmlDocument()
    {
        try
        {
            Dispose(false);
        }
        catch
        {
            // A finalizer must never let an exception escape.
        }
    }

    /// <summary>
    /// Creates a HTMLDocument by loading the file at <see cref="LocalPath"/>.
    /// </summary>
    /// <param name="asciiData">Receives the bytes of the file decoded as ASCII.</param>
    /// <returns>The loaded document object.</returns>
    /// <exception cref="ApplicationException">Loading the document failed; the cause is the inner exception.</exception>
    private HTMLDocumentClass CreateHTMLDocument(out string asciiData)
    {
        byte[] htmlData = ReadAllBytes(LocalPath);
        asciiData = Encoding.ASCII.GetString(htmlData);

        HTMLDocumentClass htmlDoc = new HTMLDocumentClass();
        try
        {
            System.Runtime.InteropServices.UCOMIPersistFile persistFile = (System.Runtime.InteropServices.UCOMIPersistFile) htmlDoc;
            persistFile.Load(LocalPath, 0);

            // Pump window messages until the document has a body and reports "complete".
            // NOTE(review): there is no timeout here; if loading never completes this spins forever.
            while(htmlDoc.body == null)
                System.Windows.Forms.Application.DoEvents();
            while(htmlDoc.readyState != "complete")
                System.Windows.Forms.Application.DoEvents();
        }
        catch(Exception e)
        {
            try
            {
                htmlDoc.close();
            }
            catch
            {
                // Keep the original failure as the one that is reported.
            }
            throw new ApplicationException("An error occurred while creating a mshtml.HTMLDocumentClass object.", e);
        }
        return htmlDoc;
    }

    /// <summary>
    /// Removes scripting from a html file.
    /// </summary>
    /// <remarks>
    /// The file is read with <see cref="File.OpenText"/> and rewritten in place with
    /// <see cref="File.CreateText"/>, both of which work with UTF-8 text.
    /// NOTE(review): content stored in another encoding may not round-trip unchanged - confirm with callers.
    /// </remarks>
    /// <param name="filename">The path of the file.</param>
    public void Preparse(string filename)
    {
        string text;
        using(TextReader reader = File.OpenText(filename))
        {
            text = reader.ReadToEnd();
        }

        text = ScriptParser.Replace(text, "");

        using(TextWriter writer = File.CreateText(filename))
        {
            writer.Write(text);
            writer.Flush();
        }
    }

    /// <summary>
    /// Creates an empty temporary file with a <c>.html</c> extension.
    /// </summary>
    /// <returns>The path of the new file.</returns>
    private static string CreateTempHtmlFile()
    {
        for(int attempt = 1; ; attempt++)
        {
            string tempName = Path.GetTempFileName();
            try
            {
                string htmlName = Path.ChangeExtension(tempName, ".html");
                File.Move(tempName, htmlName);
                return htmlName;
            }
            catch
            {
                TryDeleteFile(tempName);
                // Retry with a fresh name, but surface the failure rather than
                // looping forever when the problem is persistent.
                if(attempt >= MaxTempFileAttempts)
                    throw;
            }
        }
    }

    /// <summary>
    /// Creates a html file from an array of bytes.
    /// </summary>
    /// <param name="data">The array of bytes to create the data from.</param>
    /// <returns>The path of the temporary file holding <paramref name="data"/>.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="data"/> is null.</exception>
    private static string CreateFile(byte[] data)
    {
        if(data == null)
            throw new ArgumentNullException("data");

        string filename = CreateTempHtmlFile();
        try
        {
            // The stream is closed before the catch below runs, so the
            // cleanup delete is not attempted on a file that is still open.
            using(FileStream file = File.OpenWrite(filename))
            {
                file.Write(data, 0, data.Length);
                file.Flush();
            }
            return filename;
        }
        catch
        {
            TryDeleteFile(filename);
            throw;
        }
    }

    /// <summary>
    /// Reads the whole file at <paramref name="path"/> into a byte array.
    /// </summary>
    /// <param name="path">The path of the file to read.</param>
    /// <returns>The bytes of the file.</returns>
    /// <exception cref="EndOfStreamException">The file ended before its reported length was read.</exception>
    private static byte[] ReadAllBytes(string path)
    {
        using(FileStream file = File.OpenRead(path))
        {
            byte[] buffer = new byte[file.Length];
            int total = 0;
            while(total < buffer.Length)
            {
                int read = file.Read(buffer, total, buffer.Length - total);
                // Read returns 0 at end of stream; stop instead of spinning.
                if(read == 0)
                    throw new EndOfStreamException("The file ended before all of its data could be read.");
                total += read;
            }
            return buffer;
        }
    }

    /// <summary>
    /// Deletes a file, ignoring any failure.
    /// </summary>
    /// <param name="path">The path of the file to delete.</param>
    private static void TryDeleteFile(string path)
    {
        try
        {
            File.Delete(path);
        }
        catch
        {
            // Deliberately best-effort: cleanup must not mask the caller's own outcome.
        }
    }

    /// <summary>
    /// Returns the content of the html document.
    /// </summary>
    /// <returns>The content of the html document.</returns>
    [System.Diagnostics.DebuggerStepThrough]
    public override string ToString()
    {
        return AsciiData;
    }
}
}