Extract tabular data from a PDF

This example demonstrates how to extract tables from a PDF document using the Nutrient Document Converter Services (DCS) API. The extracted tables can be returned as an Excel spreadsheet (XLSX) or a JSON file.

Prerequisites

  • Nutrient Document Converter Services (DCS) running and accessible.
  • Appropriate service license for using the table extraction functionality.
  • The OpenService() and CloseService() helper methods, implemented as shown in the DocumentConverterServiceClient sample code.

Sample code

/// <summary>
/// Extract tabular data from a PDF and write the returned file into <paramref name="targetFolder"/>.
/// </summary>
/// <param name="ServiceURL">URL endpoint for the PDF Converter service.</param>
/// <param name="sourceFileName">Path of the source file that is read and sent to the service.</param>
/// <param name="targetFolder">Target folder to receive the output file; created if it does not exist.</param>
/// <param name="outputFileType">XLSX or JSON.</param>
/// <param name="languages">Language(s) assigned to the OCR language setting; defaults to "eng".</param>
static void TestTableExtract(string ServiceURL, string sourceFileName, string targetFolder, string outputFileType, string languages = "eng")
{
    Console.WriteLine($"Extracting tables from {sourceFileName}");
    DocumentConverterServiceClient client = null;

    // Create an `OpenOptions` instance with minimum properties needed for file identification.
    OpenOptions openOptions = new OpenOptions
    {
        FileExtension = Path.GetExtension(sourceFileName),
        OriginalFileName = Path.GetFileName(sourceFileName)
    };

    // Create a `TableExtractionSettings` object describing how the tables are extracted
    // and which output format the service should produce.
    TableExtractionSettings settings = new TableExtractionSettings
    {
        DPI = "300",
        SeparateTables = BooleanEnum.True,
        EnableOrientationDetection = BooleanEnum.True,
        EnableSkewDetection = BooleanEnum.True,
        RenderFormFields = BooleanEnum.True,
        OutputFileType = outputFileType,
        OCRLanguage = languages
    };

    try
    {
        // Read the source file into a byte array before opening the service,
        // so that a missing or unreadable file fails without creating a client.
        byte[] sourceFile = File.ReadAllBytes(sourceFileName);

        // Open the service and configure the bindings.
        client = OpenService(ServiceURL);

        // Carry out the extraction.
        BatchResult result = client.ExtractTables(sourceFile, openOptions, settings);
        if (result == null)
        {
            Console.WriteLine("No result returned");
            return; // The finally block below still closes the client.
        }

        // Directory.CreateDirectory is a no-op when the folder already exists,
        // so no separate existence check is required.
        Directory.CreateDirectory(targetFolder);
        Console.WriteLine($"Output to: {targetFolder}");

        // Get the filename reported by the service.
        string filename = result.FileName;
        Console.WriteLine(filename);

        // Write the result to a file.
        File.WriteAllBytes(Path.Combine(targetFolder, filename), result.File);
    }
    finally
    {
        // Close the client whether or not the extraction succeeded.
        if (client != null)
        {
            CloseService(client);
        }
    }
}