Shahid Manzoor Bhat
Guest
I am currently working on a problem of finding the best matches for the data in a List named "SourceData" from another List called "TargetData". Whenever an element of "SourceData" matches an element in "TargetData" with a confidence equal to or greater than the one the user supplied, I add the matched string from "TargetData" and the string from "SourceData" to a tuple, which I then save in a database.
The Levenshtein algorithm gives me a number which I compare with the threshold the user has entered; if the value returned is equal to or greater than the percentage threshold, I pair the string with the original string element of "TargetData".
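For concreteness, here is a minimal, stand-alone illustration of that check (the example strings and the 0.5/0.8 thresholds are made up; the distance of 3 is what a standard Levenshtein implementation returns for this pair):

using System;

class SimilarityExample
{
    static void Main()
    {
        // "kitten" -> "sitting" takes 3 edits; the longer string has 7 characters.
        int distance = 3; // what Levenshtein.Distance("kitten", "sitting") would return
        int maxLength = Math.Max("kitten".Length, "sitting".Length);  // 7
        double similarity = 1.0 - (double)distance / maxLength;       // ~0.571
        Console.WriteLine(similarity >= 0.5); // True  - kept with a 50% threshold
        Console.WriteLine(similarity >= 0.8); // False - rejected with an 80% threshold
    }
}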
The code I have written for this process works fine when the records in "SourceData" and "TargetData" number in the thousands, but if I increase the record count to a million it takes hours to calculate the distance for each element of "SourceData".
I need to optimize the process for huge data sets. Please advise where I need to make improvements.
My code for the process so far looks like this:
public static async Task<DataCleanseResult> PerformFuzzyMatchAsync(Transaction transaction, TransactionMetrics transactionMetrics)
{
    Tuple<bool, int> sourceUpdateStatusWithCount = null;
    DataCleanseResult fuzzyMatchResult = new DataCleanseResult();
    fuzzyMatchResult.status = false;
    var matchedWords = new ConcurrentBag<Tuple<long, string, string, string, string, string, double>>();
    string auditMessage = "Fuzzy match started";
    MaintainLog(auditMessage, transaction, true, "");
    Stopwatch timer = new Stopwatch();
    var fuzzyMatchData = new FuzzyMatchInput();
    decimal totalSourceRowCount = 0;
    try
    {
        fuzzyMatchData = await FuzzyMatchRepository.FetchSourceDestinationData(transaction.TransactionId);
        int splitGroupSize = Convert.ToInt32(ConfigurationManager.AppSettings["SplitGroupSize"]);
        int splitGroupSizeTargetData = Convert.ToInt32(ConfigurationManager.AppSettings["SplitGroupSizeTargetData"]);
        var threshold = (double)fuzzyMatchData.Threshold;
        // Batch Source as well as Target Data
        totalSourceRowCount = fuzzyMatchData.SourceDataCount;
        decimal totalTargetDataCount = fuzzyMatchData.TargetDataCount;
        var sourceDataBatchesCount = Math.Ceiling(totalSourceRowCount / splitGroupSize);
        var targetDataBatchesCount = Math.Ceiling(totalTargetDataCount / splitGroupSizeTargetData);
        timer.Start();
        for (int a = 0; a < targetDataBatchesCount; a++)
        {
            int skipRowCountTargetData = a * splitGroupSizeTargetData;
            int takeRowCountTargetData = splitGroupSizeTargetData;
            var currentTargetDataBatch = FuzzyMatchRepository.FetchTargetDataBatch(transaction.TransactionId, fuzzyMatchData.TargetTableName, fuzzyMatchData.TargetColumnName, skipRowCountTargetData, takeRowCountTargetData, fuzzyMatchData.SourceDataConnectionString);
            for (int b = 0; b < sourceDataBatchesCount; b++)
            {
                var currentBatchMatchedWords = new ConcurrentBag<Tuple<long, string, string, string, string, string, double>>();
                int skipRowCount = b * splitGroupSize;
                int takeRowCount = splitGroupSize;
                var currentSourceDataBatch = FuzzyMatchRepository.FetchSourceDataBatch(transaction.TransactionId, fuzzyMatchData.SourceTableName, fuzzyMatchData.SourceColumnName, skipRowCount, takeRowCount, fuzzyMatchData.SourceDataConnectionString);
                if (currentSourceDataBatch.Count > 0)
                {
                    var validatedValues = new List<string>();
                    // Cross product: every source word in the batch is paired with every target word in the batch.
                    var pairs = (from wordToMatch in currentSourceDataBatch
                                 from similarWord in currentTargetDataBatch
                                 select new LevenshtienInput { WordToMatch = wordToMatch, SimilarWord = similarWord });
                    var matches = pairs.
                        AsParallel().
                        Where(pair => IsLevenshteinMatch(pair, threshold)).
                        ToList();
                    foreach (var match in matches)
                    {
                        currentBatchMatchedWords.Add(Tuple.Create(transaction.TransactionId, fuzzyMatchData.SourceTableName, fuzzyMatchData.SourceColumnName, fuzzyMatchData.SourceDataConnectionString, match.WordToMatch, match.SimilarWord, match.Similarity));
                    }
                    if (currentBatchMatchedWords.Count > 0)
                    {
                        // Save the Batch.
                    }
                }
                else
                    break;
            }
        }
        timer.Stop();
        auditMessage = "Fuzzy match done.";
        MaintainLog(auditMessage, transaction, true, "");
    }
    catch (Exception)
    {
        throw; // rethrow without resetting the stack trace
    }
    return fuzzyMatchResult;
}
And my Levenshtein method does the following:
private static bool IsLevenshteinMatch(LevenshtienInput pair, double threshold)
{
    int leven = Levenshtein.Distance(pair.WordToMatch, pair.SimilarWord);
    int length = Math.Max(pair.WordToMatch.Length, pair.SimilarWord.Length);
    // Normalise the edit distance by the longer string's length to get a 0-1 similarity.
    double similarity = 1.0 - (double)leven / length;
    if (similarity >= threshold)
    {
        pair.Similarity = similarity;
        return true;
    }
    return false;
}
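For reference, one cheap pre-filter I could put in front of IsLevenshteinMatch is a length check, since the Levenshtein distance between two strings can never be smaller than the difference of their lengths. This is only a rough sketch, not existing code, and CouldPossiblyMatch is just a placeholder name:

// Hypothetical pre-filter, not part of the current code: if even the best case
// (distance == length difference) cannot reach the threshold, the expensive
// Levenshtein.Distance call can be skipped for that pair.
private static bool CouldPossiblyMatch(LevenshtienInput pair, double threshold)
{
    int maxLength = Math.Max(pair.WordToMatch.Length, pair.SimilarWord.Length);
    if (maxLength == 0)
        return false; // two empty strings: nothing meaningful to match

    int lengthDifference = Math.Abs(pair.WordToMatch.Length - pair.SimilarWord.Length);

    // Upper bound on the similarity this pair could ever achieve.
    double bestPossibleSimilarity = 1.0 - (double)lengthDifference / maxLength;
    return bestPossibleSimilarity >= threshold;
}

// Intended usage inside the batch loop, keeping IsLevenshteinMatch as the final check:
// var matches = pairs.AsParallel()
//                    .Where(pair => CouldPossiblyMatch(pair, threshold))
//                    .Where(pair => IsLevenshteinMatch(pair, threshold))
//                    .ToList();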