/java/

Apache pdfbox - pdftoText

2013-07-02 20:36:51

http://pdfbox.apache.org/index.html


<dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>pdfbox</artifactId>
    <version>1.8.2</version>
</dependency>


import org.apache.pdfbox.cos.COSDocument;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.apache.pdfbox.util.PDFTextStripper;

import java.io.File;
import java.io.FileInputStream;
import java.io.PrintWriter;

/**
 * Small command-line utility that extracts the text of a PDF file with
 * Apache PDFBox and writes it to a plain-text file.
 *
 * <p>Usage: {@code App [inputPdf outputText]}. When the two paths are not
 * supplied the built-in default paths are used.
 */
public class App {

    /** Default input PDF used by {@link #init()}. */
    private static final String DEFAULT_PDF_PATH = "/home/szalek/paka/dane/1_1.pdf";

    /** Default output text file used by {@link #init()}. */
    private static final String DEFAULT_TEXT_PATH = "/home/szalek/paka/dane/1_1.text";

    /** Charset used for the output file so non-ASCII PDF text is not mangled by the platform default. */
    private static final String OUTPUT_CHARSET = "UTF-8";

    // Working state of the most recent pdftoText call. The fields keep their
    // original names and package-private visibility for compatibility.
    PDFParser parser;
    String parsedText;
    PDFTextStripper pdfStripper;
    PDDocument pdDoc;
    COSDocument cosDoc;
    /** Not assigned anywhere in this class; retained for compatibility. */
    PDDocumentInformation pdDocInfo;

    /**
     * Program entry point.
     *
     * @param args optional: {@code args[0]} is the input PDF and {@code args[1]}
     *             the output text file; with fewer than two arguments the
     *             default paths are used
     */
    public static void main(String[] args) {
        App app = new App();
        if (args != null && args.length >= 2) {
            app.convert(args[0], args[1]);
        } else {
            app.init();
        }
    }

    /** Converts the default PDF to the default text file. */
    public void init() {
        convert(DEFAULT_PDF_PATH, DEFAULT_TEXT_PATH);
    }

    /** Extracts the text of {@code pdfPath} and, only if that succeeded, writes it to {@code textPath}. */
    private void convert(String pdfPath, String textPath) {
        String pdfToText = pdftoText(pdfPath);
        if (pdfToText == null) {
            // Extraction already reported the problem; do not create a file containing "null".
            return;
        }
        writeTexttoFile(pdfToText, textPath);
    }

    /**
     * Extracts the text of a PDF document.
     *
     * <p>The input stream and the PDFBox document objects are always released
     * before this method returns, on success as well as on failure.
     *
     * @param fileName path of the PDF file to read
     * @return the extracted text, or {@code null} if the file does not exist or
     *         could not be opened or parsed
     */
    String pdftoText(String fileName) {

        System.out.println("Parsing text from PDF file " + fileName + "....");

        // Reset per-call state so a failed run never exposes a previous result.
        parsedText = null;
        pdDoc = null;
        cosDoc = null;

        // new File(null) would throw, so treat a null name like a missing file.
        if (fileName == null || !new File(fileName).isFile()) {
            System.out.println("File " + fileName + " does not exist.");
            return null;
        }

        FileInputStream in = null;
        try {
            try {
                in = new FileInputStream(new File(fileName));
                parser = new PDFParser(in);
            } catch (Exception e) {
                System.out.println("Unable to open PDF Parser.");
                e.printStackTrace();
                return null;
            }

            try {
                parser.parse();
                cosDoc = parser.getDocument();
                pdfStripper = new PDFTextStripper();
                pdDoc = new PDDocument(cosDoc);
                parsedText = pdfStripper.getText(pdDoc);
            } catch (Exception e) {
                // Broad catch kept on purpose: any failure is reported and mapped to null.
                System.out.println("An exception occured in parsing the PDF Document.");
                e.printStackTrace();
                return null;
            }

            System.out.println("Done.");
            return parsedText;
        } finally {
            // Runs on every exit path, including the early returns above.
            closeDocuments();
            closeStream(in);
        }
    }

    /**
     * Closes the document objects opened by the last parse and clears the
     * fields so closed documents are not reused. Each handle is closed
     * independently so a failure on one does not skip the other.
     */
    private void closeDocuments() {
        if (cosDoc != null) {
            try {
                cosDoc.close();
            } catch (Exception e) {
                System.out.println("Unable to close the COS document.");
                e.printStackTrace();
            }
        }
        if (pdDoc != null) {
            try {
                pdDoc.close();
            } catch (Exception e) {
                System.out.println("Unable to close the PDF document.");
                e.printStackTrace();
            }
        }
        cosDoc = null;
        pdDoc = null;
    }

    /** Closes the input stream if it was opened, reporting (not propagating) any failure. */
    private void closeStream(FileInputStream in) {
        if (in == null) {
            return;
        }
        try {
            in.close();
        } catch (Exception e) {
            System.out.println("Unable to close the PDF input stream.");
            e.printStackTrace();
        }
    }

    /**
     * Writes the text extracted from a PDF to a file, encoded as UTF-8.
     *
     * <p>"Done." is printed only when the write actually succeeded.
     *
     * @param pdfText  text to write; when {@code null} nothing is written
     * @param fileName path of the output file (created or overwritten)
     */
    void writeTexttoFile(String pdfText, String fileName) {

        System.out.println("\nWriting PDF text to output text file " + fileName + "....");

        if (pdfText == null) {
            // PrintWriter.print(null) would write the literal text "null".
            System.out.println("No text to write; skipping output file " + fileName + ".");
            return;
        }

        PrintWriter pw = null;
        try {
            pw = new PrintWriter(fileName, OUTPUT_CHARSET);
            pw.print(pdfText);
            // PrintWriter swallows IOExceptions; checkError() flushes and reports them.
            if (pw.checkError()) {
                System.out.println("Failed to write the pdf text to file " + fileName + ".");
                return;
            }
        } catch (Exception e) {
            System.out.println("An exception occured in writing the pdf text to file.");
            e.printStackTrace();
            return;
        } finally {
            if (pw != null) {
                pw.close();
            }
        }
        System.out.println("Done.");
    }
}