Extract tabular data from a PDF

This example demonstrates how to extract tables from a PDF document using the Nutrient Document Converter Services (DCS) API. The extracted tables can be returned as an Excel (XLSX) spreadsheet or a JSON file.

Prerequisites

  • Nutrient Document Converter Services (DCS) running and accessible.

  • Appropriate service license for using the table extraction functionality.

  • Implementations of the OpenService() and CloseService() helper methods, which the sample code uses to create and close the service client.

Sample code

/// <summary>
        /// Extract tabular data from a PDF and write the returned file to a target folder.
        /// </summary>
        /// <remarks>
        /// The source file is read into memory, sent to the service together with the
        /// extraction settings, and the file returned by the service is saved under the
        /// name the service provides. Exceptions are not caught here; they propagate to
        /// the caller after the service client has been closed.
        /// </remarks>
        /// <param name="ServiceURL">URL endpoint for the PDF Converter service.</param>
        /// <param name="sourceFileName">Path of the source PDF file to read.</param>
        /// <param name="targetFolder">Target folder to receive the output file; created if it does not exist.</param>
        /// <param name="outputFileType">Output file type passed to the service: XLSX or JSON.</param>
        /// <param name="languages">Value assigned to the OCR language setting. Defaults to "eng".</param>
        static void TestTableExtract(string ServiceURL, string sourceFileName, string targetFolder, string outputFileType, string languages = "eng")
        {
            Console.WriteLine($"Extracting tables from {sourceFileName}");

            DocumentConverterServiceClient client = null;

            // Create an `OpenOptions` instance with minimum properties needed for file identification.
            OpenOptions openOptions = new OpenOptions
            {
                FileExtension = Path.GetExtension(sourceFileName),
                OriginalFileName = Path.GetFileName(sourceFileName)
            };

            // Create a `TableExtractionSettings` object describing how tables are extracted
            // and which output format is requested.
            TableExtractionSettings settings = new TableExtractionSettings
            {
                DPI = "300",
                SeparateTables = BooleanEnum.True,
                EnableOrientationDetection = BooleanEnum.True,
                EnableSkewDetection = BooleanEnum.True,
                RenderFormFields = BooleanEnum.True,
                OutputFileType = outputFileType,
                OCRLanguage = languages
            };

            try
            {
                // Read the source file into a byte array.
                byte[] sourceFile = File.ReadAllBytes(sourceFileName);

                // Open the service and configure the bindings.
                client = OpenService(ServiceURL);

                // Carry out the table extraction.
                BatchResult result = client.ExtractTables(sourceFile, openOptions, settings);

                if (result == null)
                {
                    Console.WriteLine("No result returned");
                    return;
                }

                // Create the target folder; this is a no-op when the folder already exists.
                Directory.CreateDirectory(targetFolder);
                Console.WriteLine($"Output to: {targetFolder}");

                // The service supplies the name of the output file.
                string filename = result.FileName;
                Console.WriteLine(filename);

                // Write the result to a file.
                File.WriteAllBytes(Path.Combine(targetFolder, filename), result.File);
            }
            finally
            {
                // Always close the client, even when reading, extraction, or writing fails.
                if (client != null)
                {
                    CloseService(client);
                }
            }
        }