PDF Focus.Net and OCR

Get technical support of PDF Focus .Net in C# and VB.Net
FromCustomer
Posts: 3
Joined: Mon May 24, 2021 1:44 pm
Contact:

PDF Focus.Net and OCR

Post by FromCustomer »

Hello.

We previously found that your OCR tool doesn’t work on some PDF files. Do you know why? And why does the regular PDF conversion fail on those files as well? Is there a solution? For now I have automated converting these PDFs with online services, and that seems to work... though unreliably.

Please, show me any code samples of your OCR.
Oliver
Posts: 27
Joined: Wed Aug 19, 2020 11:59 am
Contact:

Re: PDF Focus.Net and OCR

Post by Oliver »

Hello

Since version 7.0, PDF Focus .Net can work with OCR. To perform OCR, we'll use either the free OCR library by Nicomsoft or Tesseract OCR.
Both libraries are freeware and can be used in commercial applications.
I'll show you how to use each of these OCR engines.

NicomSoft OCR & PDF Focus.Net:

Code: Select all

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.IO;
using SautinSoft;
using NSOCR_NameSpace;
using System.Drawing.Imaging;


namespace Sample
{
    /// <summary>
    /// Converts PDF documents with PDF Focus .Net, routing the images found in the
    /// PDF through the Nicomsoft OCR (NSOCR) engine via <see cref="PerformOCRNicomsoft"/>.
    /// </summary>
    public class PdfConverter
    {
        // Key handed to Engine_SetLicenseKey (required for the licensed NSOCR version only).
        private const string NicomsoftLicenseKey = "AB2A4DD5FF2A";

        internal NSOCRLib.NSOCRClass NsOCR;
        internal int CfgObj = 0;
        internal int OcrObj = 0;
        internal int ImgObj = 0;
        internal int ScanObj = 0;
        internal int SvrObj = 0;
        internal bool OCRCreated = false;

        /// <summary>
        /// Converts a PDF to DOCX ("Result_Nicom.docx") and HTML ("Result_Nicom.html")
        /// with the OCR callback enabled for all images, and opens each result that
        /// was produced successfully.
        /// </summary>
        /// <param name="pdfPath">Path of the PDF file to convert.</param>
        public void ConvertPdfToAllWithOCR(string pdfPath)
        {
            // To perform OCR we'll use the free OCR library by Nicomsoft.
            // https://www.nicomsoft.com/products/ocr/download/
            // The library is freeware and can be used in commercial applications.
            // NOTE(review): the handles initialised here are stored in the instance fields
            // but are not used by the static OCR callback, which creates its own handles.
            // Kept for backward compatibility - confirm whether this is still needed.
            NsOCR = new NSOCRLib.NSOCRClass();
            NsOCR.Engine_SetLicenseKey(NicomsoftLicenseKey); //required for licensed version only
            NsOCR.Engine_InitializeAdvanced(out CfgObj, out OcrObj, out ImgObj);

            SautinSoft.PdfFocus f = new SautinSoft.PdfFocus();
            f.OCROptions.Method += PerformOCRNicomsoft;
            f.OCROptions.Mode = PdfFocus.COCROptions.eOCRMode.AllImages;
            f.WordOptions.KeepCharScaleAndSpacing = false;

            try
            {
                f.OpenPdf(pdfPath);
                if (f.PageCount <= 0)
                {
                    // f.Exception is not guaranteed to be set, so do not dereference it blindly.
                    string reason = f.Exception != null
                        ? f.Exception.Message
                        : "the PDF could not be opened or contains no pages";
                    Console.WriteLine("Error: {0}!", reason);
                    Console.ReadLine();
                    return;
                }

                // To Docx.
                string docxFile = "Result_Nicom.docx";
                f.WordOptions.Format = PdfFocus.CWordOptions.eWordDocument.Docx;
                if (f.ToWord(docxFile) == 0)
                {
                    OpenWithShell(docxFile);
                }

                // To HTML.
                string htmlFile = "Result_Nicom.html";
                f.HtmlOptions.KeepCharScaleAndSpacing = false;
                if (f.ToHtml(htmlFile) == 0)
                {
                    OpenWithShell(htmlFile);
                }
            }
            finally
            {
                // Release the source PDF whether or not the conversion succeeded.
                f.ClosePdf();
            }
        }

        /// <summary>
        /// OCR callback for PDF Focus .Net: recognises the text on a single image with
        /// NSOCR and returns the result as an in-memory PDF.
        /// </summary>
        /// <param name="image">Encoded image bytes supplied by PDF Focus .Net.</param>
        /// <returns>
        /// The bytes of the PDF produced by NSOCR, or <c>null</c> when the input is
        /// null/empty or any NSOCR step reports an error.
        /// </returns>
        public static byte[] PerformOCRNicomsoft(byte[] image)
        {
            // Nothing to recognise: report failure the same way NSOCR errors are reported.
            if (image == null || image.Length == 0)
            {
                return null;
            }

            NSOCRLib.NSOCRClass ocr = new NSOCRLib.NSOCRClass();
            int cfgObj = 0;
            int ocrObj = 0;
            int imgObj = 0;
            int svrObj = 0;

            try
            {
                ocr.Engine_SetLicenseKey(NicomsoftLicenseKey); //required for licensed version only
                ocr.Engine_InitializeAdvanced(out cfgObj, out ocrObj, out imgObj);

                // Scale: switch auto-scaling off and use a fixed factor instead.
                ocr.Cfg_SetOption(cfgObj, TNSOCR.BT_DEFAULT, "ImgAlizer/AutoScale", "0");
                ocr.Cfg_SetOption(cfgObj, TNSOCR.BT_DEFAULT, "ImgAlizer/ScaleFactor", "4.0");

                ocr.Cfg_SetOption(cfgObj, TNSOCR.BT_DEFAULT, "Languages/English", "1");

                // The COM signature takes a "ref Array"; the byte[] can be passed directly.
                Array imgArray = image;
                int res = ocr.Img_LoadFromMemory(imgObj, ref imgArray, imgArray.Length);
                if (res > TNSOCR.ERROR_FIRST)
                {
                    return null;
                }

                ocr.Svr_Create(cfgObj, TNSOCR.SVR_FORMAT_PDF, out svrObj);
                ocr.Svr_NewDocument(svrObj);

                res = ocr.Img_OCR(imgObj, TNSOCR.OCRSTEP_FIRST, TNSOCR.OCRSTEP_LAST, TNSOCR.OCRFLAG_NONE);
                if (res > TNSOCR.ERROR_FIRST)
                {
                    return null;
                }

                res = ocr.Svr_AddPage(svrObj, imgObj, TNSOCR.FMT_EXACTCOPY);
                if (res > TNSOCR.ERROR_FIRST)
                {
                    return null;
                }

                Array outPdf = null;
                ocr.Svr_SaveToMemory(svrObj, out outPdf);

                return (byte[])outPdf;
            }
            finally
            {
                // This callback runs once per image, so the handles created above must be
                // released on every exit path; otherwise they accumulate for the whole document.
                if (svrObj != 0)
                {
                    ocr.Svr_Destroy(svrObj);
                }
                if (imgObj != 0)
                {
                    ocr.Img_Destroy(imgObj);
                }
                if (ocrObj != 0)
                {
                    ocr.Ocr_Destroy(ocrObj);
                }
                if (cfgObj != 0)
                {
                    ocr.Cfg_Destroy(cfgObj);
                }
            }
        }

        // Opens a produced file with the application registered for it by the OS shell.
        private static void OpenWithShell(string path)
        {
            System.Diagnostics.Process.Start(new System.Diagnostics.ProcessStartInfo(path) { UseShellExecute = true });
        }
    }
    /// <summary>
    /// Console entry point of the Nicomsoft OCR sample.
    /// </summary>
    class Sample
    {
        /// <summary>
        /// Runs the OCR-assisted conversion on the hard-coded test document.
        /// </summary>
        static void Main(string[] args)
        {
            // OCR is performed by the free Nicomsoft OCR library:
            // https://www.nicomsoft.com/products/ocr/download/
            // It is freeware and may be used in commercial applications.
            //
            // If this sample fails to compile with errors mentioning
            //   NSOCRClass: Engine_SetLicenseKey
            //   PdfFocus: OCROptions
            // then:
            //   1. Download the Nicomsoft OCR SDK from
            //      http://www.nicomsoft.com/files/ocr/free_NSOCR_v70_build885_full.exe
            //   2. Install it on your PC or on the server.
            //   3. Build and launch the sample again.
            //
            // For the full manual see "How to use PDF Focus .Net with OCR" (Readme.html).
            // IMPORTANT: PDF Focus .Net supports OCR since version 7.0.
            new PdfConverter().ConvertPdfToAllWithOCR(Path.GetFullPath(@"d:\Test.pdf"));
        }
    }
}
Tesseract OCR & PDF Focus.Net:

Code: Select all

using System.IO;
using SautinSoft;
using System;

namespace Example
{
    /// <summary>
    /// Sample showing how to load a scanned PDF with PDF Focus .Net, using Tesseract
    /// as the OCR engine, and save the recognised document as DOCX.
    /// </summary>
    class Program
    {
        static void Main(string[] args)
        {
            // Note: Please rebuild the project to restore Nuget packages.

            LoadScannedPdf();
        }

        /// <summary>
        /// Load a scanned PDF document with help of Tesseract OCR (free OCR library) and save the result as DOCX document.
        /// </summary>
        static void LoadScannedPdf()
        {
            // Here we load a scanned PDF document (performing OCR on its images) and
            // save the OCR result as a new DOCX document.
            // Recognition uses English only by default; see PerformOCRTesseract for how to
            // enable additional languages such as Russian and Vietnamese.

            // First steps:

            // 1. Download the data file for every language you want to recognise,
            // e.g. eng.traineddata (and rus.traineddata, vie.traineddata if enabled).
            // From here (good and fast): https://github.com/tesseract-ocr/tessdata_fast
            // or (best and slow): https://github.com/tesseract-ocr/tessdata_best

            // 2. Copy the *.traineddata files to the folder "tessdata" in the Project root.

            // 3. Be sure that the folder "tessdata" also contains "pdf.ttf" file.

            // Let's start:
            string inpFile = @"d:\Test.pdf";
            string outFile = "Result_tesseract.docx";

            PdfFocus f = new PdfFocus();
            f.OCROptions.Mode = PdfFocus.COCROptions.eOCRMode.AllImages;
            f.OCROptions.Method += PerformOCRTesseract;

            bool result = false;
            try
            {
                f.OpenPdf(inpFile);
                if (f.PageCount > 0)
                {
                    result = f.ToWord(outFile) == 0;
                }
            }
            finally
            {
                // Release the source PDF whether or not the conversion succeeded.
                f.ClosePdf();
            }

            // Open the result for demonstration purposes.
            if (result)
            {
                System.Diagnostics.Process.Start(new System.Diagnostics.ProcessStartInfo(outFile) { UseShellExecute = true });
            }
            else
            {
                Console.WriteLine("Conversion failed!");
            }
        }

        /// <summary>
        /// OCR callback for PDF Focus .Net: runs Tesseract over every page frame of the
        /// supplied image and returns the recognised result as PDF bytes.
        /// </summary>
        /// <param name="image">Encoded image bytes supplied by PDF Focus .Net.</param>
        /// <returns>The content of the temporary PDF file written by the Tesseract renderer.</returns>
        /// <exception cref="Exception">
        /// Thrown when any step fails; the original exception is attached as InnerException.
        /// </exception>
        public static byte[] PerformOCRTesseract(byte[] image)
        {
            // Languages Tesseract should recognise. English only by default; to use
            // English, Russian and Vietnamese together switch to the commented line
            // (the matching *.traineddata files must be present in "tessdata").
            //string tesseractLanguages = "rus+eng+vie";
            string tesseractLanguages = "eng";

            // A path to a folder which contains languages data files and font file "pdf.ttf".
            // Language data files can be found here:
            // Good and fast: https://github.com/tesseract-ocr/tessdata_fast
            // or
            // Best and slow: https://github.com/tesseract-ocr/tessdata_best
            // Also this folder must have write permissions.
            string tesseractData = Path.GetFullPath(@"..\..\tessdata\");

            // A path for a temporary PDF file (because Tesseract returns OCR result as PDF document).
            // The result is read back from, and later deleted at, tempFile + ".pdf".
            string tempFile = Path.Combine(tesseractData, Path.GetRandomFileName());

            bool skipImages = true;

            try
            {
                using (Tesseract.IResultRenderer renderer = Tesseract.PdfResultRenderer.CreatePdfRenderer(tempFile, tesseractData, skipImages))
                {
                    using (renderer.BeginDocument("Serachablepdf"))
                    {
                        using (Tesseract.TesseractEngine engine = new Tesseract.TesseractEngine(tesseractData, tesseractLanguages, Tesseract.EngineMode.Default))
                        {
                            engine.DefaultPageSegMode = Tesseract.PageSegMode.Auto;
                            using (MemoryStream msImg = new MemoryStream(image))
                            // The decoded image holds native GDI+ resources, so dispose it explicitly.
                            using (System.Drawing.Image imgWithText = System.Drawing.Image.FromStream(msImg))
                            {
                                // The frame count does not change while iterating; read it once.
                                int frameCount = imgWithText.GetFrameCount(System.Drawing.Imaging.FrameDimension.Page);
                                for (int i = 0; i < frameCount; i++)
                                {
                                    imgWithText.SelectActiveFrame(System.Drawing.Imaging.FrameDimension.Page, i);
                                    using (MemoryStream ms = new MemoryStream())
                                    {
                                        // Re-encode the active frame as PNG before handing it to Tesseract.
                                        imgWithText.Save(ms, System.Drawing.Imaging.ImageFormat.Png);
                                        byte[] imgBytes = ms.ToArray();
                                        using (Tesseract.Pix img = Tesseract.Pix.LoadFromMemory(imgBytes))
                                        {
                                            using (var page = engine.Process(img, "Serachablepdf"))
                                            {
                                                renderer.AddPage(page);
                                            }
                                        }
                                    }
                                }
                            }
                        }
                    }
                }

                // The renderer has been disposed at this point, so the file can be read back.
                return File.ReadAllBytes(tempFile + ".pdf");
            }
            catch (Exception e)
            {
                Console.WriteLine();
                Console.WriteLine("Please be sure that you have Language data files (*.traineddata) in your folder \"tessdata\"");
                Console.WriteLine("The Language data files can be download from here: https://github.com/tesseract-ocr/tessdata_fast");
                Console.ReadKey();
                // Keep the original exception as InnerException so its type and stack trace survive.
                throw new Exception("Error Tesseract: " + e.Message, e);
            }
            finally
            {
                // Always remove the temporary PDF, on success and on failure.
                if (File.Exists(tempFile + ".pdf"))
                {
                    File.Delete(tempFile + ".pdf");
                }
            }
        }

    }
}
I've prepared both code samples in one project. Please download it here: www.sautinsoft.com/products/document/files/OCR.ZIP

Thanks
Post Reply

Who is online

Users browsing this forum: No registered users and 1 guest