diff --git a/src/main/java/technology/tabula/ObjectExtractor.java b/src/main/java/technology/tabula/ObjectExtractor.java index 87c2a2f9..bc24768a 100644 --- a/src/main/java/technology/tabula/ObjectExtractor.java +++ b/src/main/java/technology/tabula/ObjectExtractor.java @@ -1,9 +1,11 @@ package technology.tabula; import java.io.IOException; +import java.util.List; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; +import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation; public class ObjectExtractor { @@ -13,6 +15,11 @@ public ObjectExtractor(PDDocument pdfDocument) { this.pdfDocument = pdfDocument; } + private List getAnnotations(PDPage p) throws IOException { + List annotations = p.getAnnotations(); + return annotations; + } + protected Page extractPage(Integer pageNumber) throws IOException { if (pageNumber > this.pdfDocument.getNumberOfPages() || pageNumber < 1) { @@ -27,7 +34,6 @@ protected Page extractPage(Integer pageNumber) throws IOException { TextStripper pdfTextStripper = new TextStripper(this.pdfDocument, pageNumber); - pdfTextStripper.process(); Utils.sort(pdfTextStripper.textElements, Rectangle.ILL_DEFINED_ORDER); @@ -42,8 +48,18 @@ protected Page extractPage(Integer pageNumber) throws IOException { h = p.getCropBox().getHeight(); } - return new Page(0, 0, w, h, pageRotation, pageNumber, p, pdfTextStripper.textElements, + Page page = new Page(0, 0, w, h, pageRotation, pageNumber, p, pdfTextStripper.textElements, se.rulings, pdfTextStripper.minCharWidth, pdfTextStripper.minCharHeight, pdfTextStripper.spatialIndex); + + for (PDAnnotation ann: this.getAnnotations(p)) { + System.out.println(ann.getSubtype()); + System.out.println(ann.getContents()); + System.out.println(ann.getRectangle()); + System.out.println(ann.getRectangle().toGeneralPath().createTransformedShape(se.getPageTransform()).getBounds2D()); + System.out.println(); + } + + return page; } public PageIterator extract(Iterable pages) { @@ -61,7 +77,4 @@ public Page extract(int pageNumber) { public void close() throws IOException { this.pdfDocument.close(); } - - - } diff --git a/src/main/java/technology/tabula/ObjectExtractorStreamEngine.java b/src/main/java/technology/tabula/ObjectExtractorStreamEngine.java index 700d1fe7..25d40cdb 100644 --- a/src/main/java/technology/tabula/ObjectExtractorStreamEngine.java +++ b/src/main/java/technology/tabula/ObjectExtractorStreamEngine.java @@ -17,6 +17,8 @@ import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.common.PDRectangle; import org.apache.pdfbox.pdmodel.graphics.image.PDImage; +import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation; +import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationSquareCircle; import org.apache.pdfbox.util.Matrix; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -24,6 +26,7 @@ class ObjectExtractorStreamEngine extends PDFGraphicsStreamEngine { private static final String NBSP = "\u00A0"; + private final List annotations = new ArrayList<>(); protected List rulings; private AffineTransform pageTransform; @@ -32,9 +35,6 @@ class ObjectExtractorStreamEngine extends PDFGraphicsStreamEngine { private Logger log; private int clipWindingRule = -1; private GeneralPath currentPath = new GeneralPath(); - public List clippingPaths; - - private Matrix translateMatrix; protected ObjectExtractorStreamEngine(PDPage page) { super(page); @@ -59,6 +59,18 @@ protected ObjectExtractorStreamEngine(PDPage page) { } this.pageTransform.translate(-cb.getLowerLeftX(), -cb.getLowerLeftY()); + + // collect annotations + try { + for (PDAnnotation a: page.getAnnotations()) { + if (a.getSubtype().equals("Square")) { + annotations.add(a); + } + } + } catch (IOException e) { + + } + } @Override diff --git a/src/test/java/technology/tabula/TestAnnotationExtraction.java b/src/test/java/technology/tabula/TestAnnotationExtraction.java new file mode 100644 index 00000000..5f28a65c --- /dev/null +++ b/src/test/java/technology/tabula/TestAnnotationExtraction.java @@ -0,0 +1,27 @@ +package technology.tabula; + +import static org.junit.Assert.*; + +import org.apache.pdfbox.pdmodel.PDDocument; +import org.junit.Test; + +import java.io.File; +import java.io.IOException; + +public class TestAnnotationExtraction { + + @Test + public void testAnnotationsAreDetected() throws IOException { + PDDocument pdfDocument = PDDocument.load(new File("src/test/resources/technology/tabula/with_annotations.pdf")); + ObjectExtractor oe = new ObjectExtractor(pdfDocument); + PageIterator pi = oe.extract(); + + Page page = pi.next(); + + System.out.println(page); + + + } + + +} \ No newline at end of file diff --git a/src/test/resources/technology/tabula/with_annotations.pdf b/src/test/resources/technology/tabula/with_annotations.pdf new file mode 100644 index 00000000..b3dfe2b6 Binary files /dev/null and b/src/test/resources/technology/tabula/with_annotations.pdf differ