EDN Admin
Well-known member
Hi
SimHash algorithm is explained here:
http://matpalm.com/resemblance/simhash/" target="_blank" title="Sim Hash Algorithm http://matpalm.com/resemblance/simhash <br/>
I searched for a C# (or VB.NET) implementation of it but there was not even one on the Internet. What a shame! Thus I decided to implement it in C#. Would you please have a look at my implementation and tell me if it is correct?
<div style="color:Black;background-color:White; <pre>
<span style="color:Blue; public <span style="color:Blue; class SimHash
{
<span style="color:Blue; public <span style="color:Blue; float GetLikenessValue(<span style="color:Blue; string needle, <span style="color:Blue; string haystack)
{
<span style="color:Blue; var needleSimHash = DoCalculateSimHash(needle);
<span style="color:Blue; var hayStackSimHash = DoCalculateSimHash(haystack);
<span style="color:Blue; return (<span style="color:Blue; float) GetHammingDistance(needleSimHash, hayStackSimHash); <span style="color:Green; // bigger Hamming Distance means less likeness
}
<span style="color:Blue; int DoCalculateSimHash(<span style="color:Blue; string input)
{
<span style="color:Blue; const <span style="color:Blue; int hashSize = 32;
<span style="color:Blue; var hashedtokens = DoHashTokens(input.Split(<span style="color:#A31515; ));
<span style="color:Blue; var vector = <span style="color:Blue; new <span style="color:Blue; int[hashSize];
<span style="color:Blue; for (<span style="color:Blue; int i = 0; i < hashedtokens.Count; i++ ) <span style="color:Green; // for each is slow
{
<span style="color:Blue; var value = hashedtokens;
<span style="color:Blue; for (<span style="color:Blue; int j=0; j<hashSize;j ++)
{
<span style="color:Blue; if (IsBitSet(value,j))
vector[j] += 1;
<span style="color:Blue; else
vector[j] -= 1;
}
}
<span style="color:Blue; var fingerprint = 0;
<span style="color:Blue; for (<span style="color:Blue; int i = 0; i < hashSize; i++)
<span style="color:Blue; if (vector > 0)
fingerprint += 1 << i;
<span style="color:Blue; return fingerprint;
}
<span style="color:Blue; private <span style="color:Blue; static List<<span style="color:Blue; int> DoHashTokens(IEnumerable<<span style="color:Blue; string> tokens)
{
<span style="color:Blue; var hashedTokens = <span style="color:Blue; new List<<span style="color:Blue; int>();
<span style="color:Blue; foreach (<span style="color:Blue; var token <span style="color:Blue; in tokens)
hashedTokens.Add(token.GetHashCode());
<span style="color:Blue; return hashedTokens;
}
<span style="color:Blue; bool IsBitSet(<span style="color:Blue; int b, <span style="color:Blue; int pos)
{
<span style="color:Blue; return (b & (1 << pos)) != 0;
}
<span style="color:Blue; int GetHammingDistance(<span style="color:Blue; int firstValue, <span style="color:Blue; int secondValue)
{
<span style="color:Blue; var hammingBits = firstValue ^ secondValue;
<span style="color:Blue; var hammingValue = 0;
<span style="color:Blue; for (<span style="color:Blue; int i = 0; i < 32; i++)
<span style="color:Blue; if (IsBitSet(hammingBits, i))
hammingValue += 1;
<span style="color:Blue; return hammingValue;
}
}
}
[/code]
<br/>
P.S. SimHash has been patented by Google. Please dont use the implementation.<br/>
<hr class="sig Aref Karimi MCTS and MCPD
View the full article
SimHash algorithm is explained here:
http://matpalm.com/resemblance/simhash/" target="_blank" title="Sim Hash Algorithm http://matpalm.com/resemblance/simhash <br/>
I searched for a C# (or VB.NET) implementation of it but there was not even one on the Internet. What a shame! Thus I decided to implement it in C#. Would you please have a look at my implementation and tell me if it is correct?
<div style="color:Black;background-color:White; <pre>
<span style="color:Blue; public <span style="color:Blue; class SimHash
{
<span style="color:Blue; public <span style="color:Blue; float GetLikenessValue(<span style="color:Blue; string needle, <span style="color:Blue; string haystack)
{
<span style="color:Blue; var needleSimHash = DoCalculateSimHash(needle);
<span style="color:Blue; var hayStackSimHash = DoCalculateSimHash(haystack);
<span style="color:Blue; return (<span style="color:Blue; float) GetHammingDistance(needleSimHash, hayStackSimHash); <span style="color:Green; // bigger Hamming Distance means less likeness
}
<span style="color:Blue; int DoCalculateSimHash(<span style="color:Blue; string input)
{
<span style="color:Blue; const <span style="color:Blue; int hashSize = 32;
<span style="color:Blue; var hashedtokens = DoHashTokens(input.Split(<span style="color:#A31515; ));
<span style="color:Blue; var vector = <span style="color:Blue; new <span style="color:Blue; int[hashSize];
<span style="color:Blue; for (<span style="color:Blue; int i = 0; i < hashedtokens.Count; i++ ) <span style="color:Green; // for each is slow
{
<span style="color:Blue; var value = hashedtokens;
<span style="color:Blue; for (<span style="color:Blue; int j=0; j<hashSize;j ++)
{
<span style="color:Blue; if (IsBitSet(value,j))
vector[j] += 1;
<span style="color:Blue; else
vector[j] -= 1;
}
}
<span style="color:Blue; var fingerprint = 0;
<span style="color:Blue; for (<span style="color:Blue; int i = 0; i < hashSize; i++)
<span style="color:Blue; if (vector > 0)
fingerprint += 1 << i;
<span style="color:Blue; return fingerprint;
}
<span style="color:Blue; private <span style="color:Blue; static List<<span style="color:Blue; int> DoHashTokens(IEnumerable<<span style="color:Blue; string> tokens)
{
<span style="color:Blue; var hashedTokens = <span style="color:Blue; new List<<span style="color:Blue; int>();
<span style="color:Blue; foreach (<span style="color:Blue; var token <span style="color:Blue; in tokens)
hashedTokens.Add(token.GetHashCode());
<span style="color:Blue; return hashedTokens;
}
<span style="color:Blue; bool IsBitSet(<span style="color:Blue; int b, <span style="color:Blue; int pos)
{
<span style="color:Blue; return (b & (1 << pos)) != 0;
}
<span style="color:Blue; int GetHammingDistance(<span style="color:Blue; int firstValue, <span style="color:Blue; int secondValue)
{
<span style="color:Blue; var hammingBits = firstValue ^ secondValue;
<span style="color:Blue; var hammingValue = 0;
<span style="color:Blue; for (<span style="color:Blue; int i = 0; i < 32; i++)
<span style="color:Blue; if (IsBitSet(hammingBits, i))
hammingValue += 1;
<span style="color:Blue; return hammingValue;
}
}
}
[/code]
<br/>
P.S. SimHash has been patented by Google. Please dont use the implementation.<br/>
<hr class="sig Aref Karimi MCTS and MCPD
View the full article