A
abhisheksingha
Guest
// Fragment: opens a PDF, walks every object in its cross-reference table, and for each
// FlateDecode stream looks for the search text, then collects the numeric tokens that
// follow it up to the next "re" (rectangle) operator and shows them in a message box.
// NOTE(review): the enclosing method header and the `try {` that pairs with the trailing
// `catch` are not part of this fragment; `pdfReader` is declared elsewhere.
string fileName = @"E:\TextboxPRINT_MS.pdf";
string searthText = @"Paul Chronopoulos";
pdfReader = new PdfReader(fileName);
// The per-page loop is commented out; its braces remain as a bare block, so the body runs once.
//for (int page = 1; page <= pdfReader.NumberOfPages; page++)
{
// ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();
// string currentPageText = PdfTextExtractor.GetTextFromPage(pdfReader, page, strategy);
//Check for identifier
// The page-text check is commented out as well, so the block below is unconditional.
// if (currentPageText.Contains(searthText))
{
//Get Stream object
// NOTE(review): the upper bound is inclusive (i == XrefSize is requested); confirm
// GetPdfObject tolerates that index.
for (int i = 1; i <= pdfReader.XrefSize; i++)
{
//Get object.
PdfObject obj = pdfReader.GetPdfObject(i);
//Check for stream object.
if (obj != null && obj.IsStream())
{
//Get stream object.
PRStream stream = (PRStream)obj;
// obj was already checked for null above and a direct cast does not produce null,
// so this guard is always true here.
if (stream != null)
{
//PdfStream stream2 = (PdfStream)obj;
//Check for FlateDecode object.
PdfObject filterObj = stream.Get(PdfName.FILTER);
if (filterObj != null)
{
// Only streams whose /Filter entry compares equal to the FlateDecode name are processed.
// NOTE(review): presumably a filter expressed as an array will not compare equal — confirm.
bool flateDecodeObj = filterObj.Equals(PdfName.FLATEDECODE);
if (flateDecodeObj)
{
//Get raw bytes.
byte[] streamBytes;
// Try the decoded bytes first; on any exception fall back to the raw bytes.
// The caught exception (ex) is not used or reported.
try
{
streamBytes = PdfReader.GetStreamBytes(stream);
}
catch (Exception ex)
{
streamBytes = PdfReader.GetStreamBytesRaw(stream);
}
if (streamBytes.Length != 0)
{
// Numeric tokens collected after the search text has been matched.
List<string> buf = new List<string>();
// Debug dump of the stream as ASCII; the same file is overwritten for every stream.
string streamData1 = Encoding.ASCII.GetString(streamBytes);
System.IO.File.WriteAllText(@"E:\Table.txt", streamData1);
PRTokeniser tok = new PRTokeniser(new RandomAccessFileOrArray(streamBytes));
if (tok.Length > 0)
{
// Concatenation of every STRING token seen so far in this stream.
string identifier = string.Empty;
//string identifier2 = string.Empty;
while (tok.NextToken())
{
//if (tok.TokenType == PRTokeniser.TokType.ENDOFFILE)
//{
// break;
//}
if (tok.TokenType == PRTokeniser.TokType.STRING)
{
string st = tok.StringValue;
identifier = identifier + st;
}
identifier = identifier.Trim();
// Collection only happens while the accumulated strings equal the search text exactly;
// a further STRING token changes identifier and the comparison stops matching.
if (searthText== identifier)
{
if (tok.TokenType == PRTokeniser.TokType.NUMBER)
{
buf.Add(tok.StringValue);
//string st = tok.StringValue;
//identifier2 = identifier2 + st + " ";
}
else if (tok.TokenType == PRTokeniser.TokType.OTHER)
{
//Look for a rectangle token
if (tok.StringValue == "re")
{
// Stop tokenising at the first "re" after the match; buf keeps every number seen since the match.
break;
////Sanity check, make sure we have enough items in the buffer
//if (buf.Count < 4) throw new Exception("Not enough elements in buffer for a rectangle");
////Read and convert the values
//float x = float.Parse(buf[buf.Count - 4]);
//float y = float.Parse(buf[buf.Count - 3]);
//float w = float.Parse(buf[buf.Count - 2]);
//float h = float.Parse(buf[buf.Count - 1]);
////..do something with them here
}
}
}
}
// NOTE(review): only Count > 0 is checked, but buf[1]..buf[3] are read; fewer than four
// collected numbers would make the indexer throw. The FIRST four entries are shown,
// not the last four as in the commented-out sanity check above.
if (buf.Count > 0)
{
MessageBox.Show("X = " + buf[0].ToString() + "\n" + "Y = " + buf[1].ToString() + "\n" + "Width = " + buf[2].ToString() + "\n" + "Height = " + buf[3].ToString());
return;
}
//try
//{
// byte[] bytes = Convert.FromBase64String(identifier);
// System.Text.UTF8Encoding encoder = new System.Text.UTF8Encoding();
// System.Text.Decoder decoder = encoder.GetDecoder();
// int count = decoder.GetCharCount(bytes, 0, bytes.Length);
// char[] arr = new char[count];
// decoder.GetChars(bytes, 0, bytes.Length, arr, 0);
// identifier = new string(arr);
//}
//catch (Exception exx)
//{
// MessageBox.Show(exx.Message);
//}
identifier = identifier.Trim();
// if (!searthText.Contains(identifier))
// continue;
// NOTE(review): string.Contains("") is true, so a stream with no STRING tokens at all
// (identifier still empty) also enters this branch.
if (searthText.Contains(identifier))
{
// Second pass over the same bytes: concatenate every OTHER token (no separator) and display it.
identifier = string.Empty;
PRTokeniser tok2 = new PRTokeniser(new RandomAccessFileOrArray(streamBytes));
while (tok2.NextToken())
{
//if (tok.TokenType == PRTokeniser.TokType.ENDOFFILE)
//{
// break;
//}
if (tok2.TokenType == PRTokeniser.TokType.OTHER)
{
string st = tok2.StringValue;
identifier = identifier + st;
}
}
MessageBox.Show(identifier);
}
}
//
//PRTokeniser tokeniser = new PRTokeniser(new RandomAccessFileOrArray(RASFACTORY.createSource(streamBytes)));
//PdfContentParser ps = new PdfContentParser(tokeniser);
//ArrayList<PdfObject> operands = new ArrayList<PdfObject>();
//while (ps.parse(operands).size() > 0) {
// // PdfLiteral operator = (PdfLiteral) operands.get(operands.size() - 1);
// // processOperator(operator, operands);
//}
// PRTokeniser token = new PRTokeniser(streamBytes);
//Get string from the raw bytes.
// Fallback path: plain text search on the ASCII-decoded stream (same decoding as streamData1 above).
string streamData = Encoding.ASCII.GetString(streamBytes);
// Both tokeniser loops are closed by now, so `continue` advances the xref loop.
if (string.IsNullOrEmpty(streamData))
continue;
// NOTE(review): this dump goes to F:\ while the earlier one goes to E:\ — confirm which is intended.
System.IO.File.WriteAllText(@"F:\Table.txt", streamData);
// IsMCID is computed but never read; the check that used it is commented out below.
bool IsMCID = streamData.Contains("Tj");
// if (!IsMCID)
// continue;
//For debug
// Writes the same content to the same file a second time.
System.IO.File.WriteAllText(@"F:\Table.txt", streamData);
//continue;
//Get content tokens
// Splits on the substring "re" wherever it occurs, not only where it is a standalone operator.
string[] tokens = streamData.Split(new[] { "re" }, StringSplitOptions.None);
// NOTE(review): Split with StringSplitOptions.None returns at least one element, so this
// guard presumably never fires.
if (tokens.Length == 0)
continue;
//Get first token
string firstToken = tokens[0].Trim();
//Get last occurrence index of the line break, i.e. the start of the last line before the first "re".
int pFrom = firstToken.LastIndexOf("\n");
//Get the rect coordinates.
// When no "\n" is found pFrom is -1, so Substring(0) takes the whole text.
string rect = firstToken.Substring(pFrom + 1);
string[] rectArray = rect.Split(' ');
// Only a last line made of exactly four space-separated parts is reported as x, y, width, height.
if (rectArray.Length == 4)
{
MessageBox.Show("X = " + rectArray[0].ToString() + "\n" + "Y = " + rectArray[1].ToString() + "\n" + "Width = " + rectArray[2].ToString() + "\n" + "Height = " + rectArray[3].ToString());
return;
}
}
}
}
}
}
}
}
}
// The brace below closes a `try` block that was opened before this fragment begins.
}
catch (Exception ex)
{
// Any exception raised in the scan is reported to the user by message only.
MessageBox.Show(ex.Message);
}
// The remaining braces close scopes opened outside this fragment.
}
}
}
Continue reading...
// Fragment (repeat of the same snippet): walks every object in the PDF's cross-reference
// table, and for each FlateDecode stream looks for the search text, then collects the numeric
// tokens that follow it up to the next "re" (rectangle) operator and shows them in a message box.
// NOTE(review): `fileName`, `pdfReader`, the enclosing method header and the `try {` that
// pairs with the trailing `catch` are all declared outside this fragment.
string searthText = @"Paul Chronopoulos";
pdfReader = new PdfReader(fileName);
// The per-page loop is commented out; its braces remain as a bare block, so the body runs once.
//for (int page = 1; page <= pdfReader.NumberOfPages; page++)
{
// ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();
// string currentPageText = PdfTextExtractor.GetTextFromPage(pdfReader, page, strategy);
//Check for identifier
// The page-text check is commented out as well, so the block below is unconditional.
// if (currentPageText.Contains(searthText))
{
//Get Stream object
// NOTE(review): the upper bound is inclusive (i == XrefSize is requested); confirm
// GetPdfObject tolerates that index.
for (int i = 1; i <= pdfReader.XrefSize; i++)
{
//Get object.
PdfObject obj = pdfReader.GetPdfObject(i);
//Check for stream object.
if (obj != null && obj.IsStream())
{
//Get stream object.
PRStream stream = (PRStream)obj;
// obj was already checked for null above and a direct cast does not produce null,
// so this guard is always true here.
if (stream != null)
{
//PdfStream stream2 = (PdfStream)obj;
//Check for FlateDecode object.
PdfObject filterObj = stream.Get(PdfName.FILTER);
if (filterObj != null)
{
// Only streams whose /Filter entry compares equal to the FlateDecode name are processed.
// NOTE(review): presumably a filter expressed as an array will not compare equal — confirm.
bool flateDecodeObj = filterObj.Equals(PdfName.FLATEDECODE);
if (flateDecodeObj)
{
//Get raw bytes.
byte[] streamBytes;
// Try the decoded bytes first; on any exception fall back to the raw bytes.
// The caught exception (ex) is not used or reported.
try
{
streamBytes = PdfReader.GetStreamBytes(stream);
}
catch (Exception ex)
{
streamBytes = PdfReader.GetStreamBytesRaw(stream);
}
if (streamBytes.Length != 0)
{
// Numeric tokens collected after the search text has been matched.
List<string> buf = new List<string>();
// Debug dump of the stream as ASCII; the same file is overwritten for every stream.
string streamData1 = Encoding.ASCII.GetString(streamBytes);
System.IO.File.WriteAllText(@"E:\Table.txt", streamData1);
PRTokeniser tok = new PRTokeniser(new RandomAccessFileOrArray(streamBytes));
if (tok.Length > 0)
{
// Concatenation of every STRING token seen so far in this stream.
string identifier = string.Empty;
//string identifier2 = string.Empty;
while (tok.NextToken())
{
//if (tok.TokenType == PRTokeniser.TokType.ENDOFFILE)
//{
// break;
//}
if (tok.TokenType == PRTokeniser.TokType.STRING)
{
string st = tok.StringValue;
identifier = identifier + st;
}
identifier = identifier.Trim();
// Collection only happens while the accumulated strings equal the search text exactly;
// a further STRING token changes identifier and the comparison stops matching.
if (searthText== identifier)
{
if (tok.TokenType == PRTokeniser.TokType.NUMBER)
{
buf.Add(tok.StringValue);
//string st = tok.StringValue;
//identifier2 = identifier2 + st + " ";
}
else if (tok.TokenType == PRTokeniser.TokType.OTHER)
{
//Look for a rectangle token
if (tok.StringValue == "re")
{
// Stop tokenising at the first "re" after the match; buf keeps every number seen since the match.
break;
////Sanity check, make sure we have enough items in the buffer
//if (buf.Count < 4) throw new Exception("Not enough elements in buffer for a rectangle");
////Read and convert the values
//float x = float.Parse(buf[buf.Count - 4]);
//float y = float.Parse(buf[buf.Count - 3]);
//float w = float.Parse(buf[buf.Count - 2]);
//float h = float.Parse(buf[buf.Count - 1]);
////..do something with them here
}
}
}
}
// NOTE(review): only Count > 0 is checked, but buf[1]..buf[3] are read; fewer than four
// collected numbers would make the indexer throw. The FIRST four entries are shown,
// not the last four as in the commented-out sanity check above.
if (buf.Count > 0)
{
MessageBox.Show("X = " + buf[0].ToString() + "\n" + "Y = " + buf[1].ToString() + "\n" + "Width = " + buf[2].ToString() + "\n" + "Height = " + buf[3].ToString());
return;
}
//try
//{
// byte[] bytes = Convert.FromBase64String(identifier);
// System.Text.UTF8Encoding encoder = new System.Text.UTF8Encoding();
// System.Text.Decoder decoder = encoder.GetDecoder();
// int count = decoder.GetCharCount(bytes, 0, bytes.Length);
// char[] arr = new char[count];
// decoder.GetChars(bytes, 0, bytes.Length, arr, 0);
// identifier = new string(arr);
//}
//catch (Exception exx)
//{
// MessageBox.Show(exx.Message);
//}
identifier = identifier.Trim();
// if (!searthText.Contains(identifier))
// continue;
// NOTE(review): string.Contains("") is true, so a stream with no STRING tokens at all
// (identifier still empty) also enters this branch.
if (searthText.Contains(identifier))
{
// Second pass over the same bytes: concatenate every OTHER token (no separator) and display it.
identifier = string.Empty;
PRTokeniser tok2 = new PRTokeniser(new RandomAccessFileOrArray(streamBytes));
while (tok2.NextToken())
{
//if (tok.TokenType == PRTokeniser.TokType.ENDOFFILE)
//{
// break;
//}
if (tok2.TokenType == PRTokeniser.TokType.OTHER)
{
string st = tok2.StringValue;
identifier = identifier + st;
}
}
MessageBox.Show(identifier);
}
}
//
//PRTokeniser tokeniser = new PRTokeniser(new RandomAccessFileOrArray(RASFACTORY.createSource(streamBytes)));
//PdfContentParser ps = new PdfContentParser(tokeniser);
//ArrayList<PdfObject> operands = new ArrayList<PdfObject>();
//while (ps.parse(operands).size() > 0) {
// // PdfLiteral operator = (PdfLiteral) operands.get(operands.size() - 1);
// // processOperator(operator, operands);
//}
// PRTokeniser token = new PRTokeniser(streamBytes);
//Get string from the raw bytes.
// Fallback path: plain text search on the ASCII-decoded stream (same decoding as streamData1 above).
string streamData = Encoding.ASCII.GetString(streamBytes);
// Both tokeniser loops are closed by now, so `continue` advances the xref loop.
if (string.IsNullOrEmpty(streamData))
continue;
// NOTE(review): this dump goes to F:\ while the earlier one goes to E:\ — confirm which is intended.
System.IO.File.WriteAllText(@"F:\Table.txt", streamData);
// IsMCID is computed but never read; the check that used it is commented out below.
bool IsMCID = streamData.Contains("Tj");
// if (!IsMCID)
// continue;
//For debug
// Writes the same content to the same file a second time.
System.IO.File.WriteAllText(@"F:\Table.txt", streamData);
//continue;
//Get content tokens
// Splits on the substring "re" wherever it occurs, not only where it is a standalone operator.
string[] tokens = streamData.Split(new[] { "re" }, StringSplitOptions.None);
// NOTE(review): Split with StringSplitOptions.None returns at least one element, so this
// guard presumably never fires.
if (tokens.Length == 0)
continue;
//Get first token
string firstToken = tokens[0].Trim();
//Get last occurrence index of the line break, i.e. the start of the last line before the first "re".
int pFrom = firstToken.LastIndexOf("\n");
//Get the rect coordinates.
// When no "\n" is found pFrom is -1, so Substring(0) takes the whole text.
string rect = firstToken.Substring(pFrom + 1);
string[] rectArray = rect.Split(' ');
// Only a last line made of exactly four space-separated parts is reported as x, y, width, height.
if (rectArray.Length == 4)
{
MessageBox.Show("X = " + rectArray[0].ToString() + "\n" + "Y = " + rectArray[1].ToString() + "\n" + "Width = " + rectArray[2].ToString() + "\n" + "Height = " + rectArray[3].ToString());
return;
}
}
}
}
}
}
}
}
}
// The brace below closes a `try` block that was opened before this fragment begins.
}
catch (Exception ex)
{
// Any exception raised in the scan is reported to the user by message only.
MessageBox.Show(ex.Message);
}
// The remaining braces close scopes opened outside this fragment.
}
}
}
Continue reading...