OcrPage Method (GdPicturePDF)
Runs the optical character recognition (OCR) on the currently selected page of the loaded PDF document according to what you have specified. The recognized text is added as invisible text on the page. The page orientation is automatically detected.
This method involves a rasterization process so any existing visible text within the current page will become a part of the image of that page before the OCR process starts. The same applies to the invisible text contained within the current page. It is not kept because of the rasterization process, which simply means any invisible text is removed from the page before the OCR process starts.
This method uses one thread when processing. You can also benefit from using several OCR related events like GdPicturePDF.BeforePageOcr, GdPicturePDF.OcrPagesProgress and GdPicturePDF.OcrPagesDone.
Runs the optical character recognition (OCR) on the currently selected page of the loaded PDF document.
public function OcrPage(
Dictionary: String;
DictionaryPath: String;
CharWhiteList: String;
DPI: Single
): GdPictureStatus;
public function OcrPage(
Dictionary : String,
DictionaryPath : String,
CharWhiteList : String,
DPI : float
) : GdPictureStatus;
'Declaration
Public Function OcrPage( _
ByVal Dictionary As String, _
ByVal DictionaryPath As String, _
ByVal CharWhiteList As String, _
ByVal DPI As Single _
) As GdPictureStatus
Parameters
- Dictionary
- The prefix of the dictionary file to use, for example, "spa" for Spanish, "eng" for English, "fra" for French, etc.
The name of such dictionary file has a predefined format [LANGUAGE].traineddata, where [LANGUAGE] defines the used language. You can find these files within your standard installation usually in the directory @\GdPicture.Net 14\Redist\OCR or you can download additional language dictionary files here.
You can also combine multiple dictionaries with the "+" separator, for instance English with French is "eng+fra".
- DictionaryPath
- The path with all installed dictionary files the OCR engine will use. The proper path is usually within your standard installation
and it looks like @\GdPicture.Net 14\Redist\OCR. Of course you can specify your own path as well.
Set this parameter to null if you are using the GdPicture.Resource NuGet package.
- CharWhiteList
- The so-called white list of characters, in other words the restricted set of characters to recognize. It means that the engine only returns the specified characters when processing.
For example, if you want to only recognize numeric characters, set this parameter to "0123456789". If you want to only recognize uppercase letters,
set it to "ABCDEFGHIJKLMNOPQRSTUVWXYZ". Set this parameter to the empty string to recognize all characters.
- DPI
- The dpi resolution the OCR engine will use. It is recommended to use 300 by default.
A value between 200 and 300 should give optimal results on A4-sized documents. Generally values over 300 will cause excessive memory usage.
Return Value
A member of the GdPictureStatus enumeration. If the method has completed successfully, then the return value is GdPictureStatus.OK.
We strongly recommend always checking this status first.
How to apply the OCR method to your scanned PDF document.
You can also find other OCR examples in the Programming section of the GdPicture.NET Reference Guide.
'Example: runs OcrPage() on every page of the input PDF, writes the text returned by
'GetPageText() for each processed page into a text file and saves the document to a new file.
'A summary of what happened on each page is shown in a message box at the end.
Dim caption As String = "Example: OcrPage"
Dim message As String = ""
'Expecting that the input pdf document includes scanned pages.
Const inputPdfPath As String = "test.pdf"
Const outputPdfPath As String = "test_done.pdf"
Const textFile As String = "test_text_after_ocr.txt"
'The path to language-files installed for OCR - please make sure this path is correct according to your installation.
Const pathToOcr As String = "C:\GdPicture.NET 14\Redist\OCR"
'The Using block disposes the GdPicturePDF object on every exit path (including exceptions),
'so no Goto/label based cleanup is required.
Using gdpicturePDF As New GdPicturePDF()
    If gdpicturePDF.LoadFromFile(inputPdfPath, False) <> GdPictureStatus.OK Then
        MessageBox.Show("The file can't be loaded.", caption)
    Else
        Dim pageCount As Integer = gdpicturePDF.GetPageCount()
        If gdpicturePDF.GetStat() <> GdPictureStatus.OK Then
            MessageBox.Show("The GetPageCount() method has failed with the status: " + gdpicturePDF.GetStat().ToString(), caption)
        ElseIf pageCount = 0 Then
            MessageBox.Show("This input PDF document contains no pages.", caption)
        Else
            'The writer is closed by End Using even if processing a page throws.
            Using text_file As New System.IO.StreamWriter(textFile)
                For i As Integer = 1 To pageCount
                    'Every later call works on the currently selected page, so selection must succeed first.
                    If gdpicturePDF.SelectPage(i) <> GdPictureStatus.OK Then
                        message &= "The SelectPage() method has failed for the page nr." & i.ToString() & " with the status: " & gdpicturePDF.GetStat().ToString() & vbCrLf
                        text_file.WriteLine("The page nr." & i.ToString() & " has not been processed.")
                        Continue For
                    End If

                    'PageHasText() reports its outcome through GetStat(), not through its return value.
                    Dim hasText As Boolean = gdpicturePDF.PageHasText()
                    If gdpicturePDF.GetStat() <> GdPictureStatus.OK Then
                        message &= "The PageHasText() method has failed for the page nr." & i.ToString() & " with the status: " & gdpicturePDF.GetStat().ToString() & vbCrLf
                        text_file.WriteLine("The page nr." & i.ToString() & " has not been processed.")
                        Continue For
                    End If

                    'Let the user decide whether a page that already contains text should be skipped.
                    If hasText AndAlso (MessageBox.Show("The page nr." & i.ToString() & " already has text. Do you want to skip this page?",
                                                        caption, MessageBoxButtons.YesNo, MessageBoxIcon.[Stop]) = System.Windows.Forms.DialogResult.Yes) Then
                        message &= "The page nr." & i.ToString() & " has been skipped." & vbCrLf
                        text_file.WriteLine("The page nr." & i.ToString() & " has been skipped.")
                        Continue For
                    End If

                    If gdpicturePDF.OcrPage("eng", pathToOcr, "", 300) = GdPictureStatus.OK Then
                        message &= "The page nr." & i.ToString() & " has been successfully processed." & vbCrLf
                        text_file.WriteLine("The page nr." & i.ToString() & " contains this text:")
                        Dim page_text As String = gdpicturePDF.GetPageText()
                        'Only write the text when GetPageText() itself succeeded.
                        If gdpicturePDF.GetStat() = GdPictureStatus.OK Then
                            text_file.WriteLine(page_text)
                        End If
                    Else
                        message &= "The OcrPage() method has failed for the page nr." & i.ToString() & " with the status: " & gdpicturePDF.GetStat().ToString() & vbCrLf
                        text_file.WriteLine("The page nr." & i.ToString() & " has not been processed.")
                    End If
                Next

                If gdpicturePDF.SaveToFile(outputPdfPath, True) = GdPictureStatus.OK Then
                    message &= "The file has been successfully saved."
                Else
                    message &= "The file can't be saved."
                End If
                MessageBox.Show(message, caption)
            End Using
        End If
    End If
End Using
// Example: runs OcrPage() on every page of the input PDF, writes the text returned by
// GetPageText() for each processed page into a text file and saves the document to a new file.
// A summary of what happened on each page is shown in a message box at the end.
string caption = "Example: OcrPage";
string message = "";
//Expecting that the input pdf document includes scanned pages.
const string inputPdfPath = "test.pdf";
const string outputPdfPath = "test_done.pdf";
const string textFile = "test_text_after_ocr.txt";
//The path to language-files installed for OCR - please make sure this path is correct according to your installation.
const string pathToOcr = "C:\\GdPicture.NET 14\\Redist\\OCR";
// The using statement disposes the GdPicturePDF object on every exit path (including exceptions),
// so no goto/label based cleanup is required.
using (GdPicturePDF gdpicturePDF = new GdPicturePDF())
{
    if (gdpicturePDF.LoadFromFile(inputPdfPath, false) != GdPictureStatus.OK)
    {
        MessageBox.Show("The file can't be loaded.", caption);
    }
    else
    {
        int pageCount = gdpicturePDF.GetPageCount();
        if (gdpicturePDF.GetStat() != GdPictureStatus.OK)
        {
            MessageBox.Show("The GetPageCount() method has failed with the status: " + gdpicturePDF.GetStat().ToString(), caption);
        }
        else if (pageCount == 0)
        {
            MessageBox.Show("This input PDF document contains no pages.", caption);
        }
        else
        {
            // The writer is closed when this block exits, even if processing a page throws.
            using (System.IO.StreamWriter text_file = new System.IO.StreamWriter(textFile))
            {
                for (int i = 1; i <= pageCount; i++)
                {
                    // Every later call works on the currently selected page, so selection must succeed first.
                    if (gdpicturePDF.SelectPage(i) != GdPictureStatus.OK)
                    {
                        message += "The SelectPage() method has failed for the page nr." + i.ToString() + " with the status: " + gdpicturePDF.GetStat().ToString() + "\n";
                        text_file.WriteLine("The page nr." + i.ToString() + " has not been processed.");
                        continue;
                    }

                    // PageHasText() reports its outcome through GetStat(), not through its return value.
                    bool hasText = gdpicturePDF.PageHasText();
                    if (gdpicturePDF.GetStat() != GdPictureStatus.OK)
                    {
                        message += "The PageHasText() method has failed for the page nr." + i.ToString() + " with the status: " + gdpicturePDF.GetStat().ToString() + "\n";
                        text_file.WriteLine("The page nr." + i.ToString() + " has not been processed.");
                        continue;
                    }

                    // Let the user decide whether a page that already contains text should be skipped.
                    if (hasText &&
                        (MessageBox.Show("The page nr." + i.ToString() + " already has text. Do you want to skip this page?",
                                         caption, MessageBoxButtons.YesNo, MessageBoxIcon.Stop) == System.Windows.Forms.DialogResult.Yes))
                    {
                        message += "The page nr." + i.ToString() + " has been skipped.\n";
                        text_file.WriteLine("The page nr." + i.ToString() + " has been skipped.");
                        continue;
                    }

                    if (gdpicturePDF.OcrPage("eng", pathToOcr, "", 300) == GdPictureStatus.OK)
                    {
                        message += "The page nr." + i.ToString() + " has been successfully processed.\n";
                        text_file.WriteLine("The page nr." + i.ToString() + " contains this text:");
                        string page_text = gdpicturePDF.GetPageText();
                        // Only write the text when GetPageText() itself succeeded.
                        if (gdpicturePDF.GetStat() == GdPictureStatus.OK)
                        {
                            text_file.WriteLine(page_text);
                        }
                    }
                    else
                    {
                        message += "The OcrPage() method has failed for the page nr." + i.ToString() + " with the status: " + gdpicturePDF.GetStat().ToString() + "\n";
                        text_file.WriteLine("The page nr." + i.ToString() + " has not been processed.");
                    }
                }

                if (gdpicturePDF.SaveToFile(outputPdfPath, true) == GdPictureStatus.OK)
                {
                    message += "The file has been successfully saved.";
                }
                else
                {
                    message += "The file can't be saved.";
                }
                MessageBox.Show(message, caption);
            }
        }
    }
}