package technology.tabula; import java.awt.geom.Point2D; import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; import java.util.List; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; import static java.lang.Float.compare; import static java.util.Collections.min; @SuppressWarnings("serial") // TODO: this class should probably be called "PageArea" or something like that public class Page extends Rectangle { private int number; private Integer rotation; private float minCharWidth; private float minCharHeight; private List textElements; // TODO: Create a class for 'List ' that encapsulates all of these lists and their behaviors? private List rulings, cleanRulings = null, verticalRulingLines = null, horizontalRulingLines = null; private PDPage pdPage; private PDDocument pdDoc; private RectangleSpatialIndex spatialIndex; private static final float DEFAULT_MIN_CHAR_LENGTH = 7; private Page( PageDims pageDims, int rotation, int number, PDPage pdPage, PDDocument doc, List characters, List rulings, float minCharWidth, float minCharHeight, RectangleSpatialIndex index ) { super(pageDims.getTop(), pageDims.getLeft(), pageDims.getWidth(), pageDims.getHeight()); this.rotation = rotation; this.number = number; this.pdPage = pdPage; this.pdDoc = doc; this.textElements = characters; this.rulings = rulings; this.minCharWidth = minCharWidth; this.minCharHeight = minCharHeight; this.spatialIndex = index; } /** * * @deprecated use {@link Builder} instead */ @Deprecated public Page(float top, float left, float width, float height, int rotation, int number, PDPage pdPage, PDDocument doc) { super(top, left, width, height); this.rotation = rotation; this.number = number; this.pdPage = pdPage; this.pdDoc = doc; } /** * * @deprecated use {@link Builder} instead */ public Page(float top, float left, float width, float height, int rotation, int number, PDPage pdPage, PDDocument doc, List characters, List rulings) { this(top, left, width, height, rotation, number, pdPage, doc); this.textElements = characters; this.rulings = rulings; } /** * * @deprecated use {@link Builder} instead */ public Page(float top, float left, float width, float height, int rotation, int number, PDPage pdPage, PDDocument doc, ObjectExtractorStreamEngine streamEngine, TextStripper textStripper) { this(top, left, width, height, rotation, number, pdPage, doc, textStripper.getTextElements(), streamEngine.rulings); this.minCharWidth = textStripper.getMinCharWidth(); this.minCharHeight = textStripper.getMinCharHeight(); this.spatialIndex = textStripper.getSpatialIndex(); } /** * * @deprecated use {@link Builder} instead */ public Page(float top, float left, float width, float height, int rotation, int number, PDPage pdPage, PDDocument doc, List characters, List rulings, float minCharWidth, float minCharHeight, RectangleSpatialIndex index) { this(top, left, width, height, rotation, number, pdPage, doc, characters, rulings); this.minCharHeight = minCharHeight; this.minCharWidth = minCharWidth; this.spatialIndex = index; } // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // public Page getArea(Rectangle area) { List areaTextElements = getText(area); float minimumCharWidth = getMinimumCharWidthFrom(areaTextElements); float minimumCharHeight = getMinimumCharHeightFrom(areaTextElements); final Page page = Page.Builder.newInstance() .withPageDims(PageDims.of(area.getTop(), area.getLeft(), (float) area.getWidth(), (float) area.getHeight())) .withRotation(rotation) .withNumber(number) .withPdPage(pdPage) .withPdDocument(pdDoc) .withTextElements(areaTextElements) .withRulings(Ruling.cropRulingsToArea(getRulings(), area)) .withMinCharWidth(minimumCharWidth) .withMinCharHeight(minimumCharHeight) .withIndex(spatialIndex) .build(); addBorderRulingsTo(page); return page; } private float getMinimumCharWidthFrom(List areaTextElements) { if (!areaTextElements.isEmpty()) { return min(areaTextElements, (te1, te2) -> compare(te1.width, te2.width)).width; } return DEFAULT_MIN_CHAR_LENGTH; } private float getMinimumCharHeightFrom(List areaTextElements) { if (!areaTextElements.isEmpty()) { return min(areaTextElements, (te1, te2) -> compare(te1.height, te2.height)).height; } return DEFAULT_MIN_CHAR_LENGTH; } private void addBorderRulingsTo(Page page) { Point2D.Double leftTop = new Point2D.Double(page.getLeft(), page.getTop()), rightTop = new Point2D.Double(page.getRight(), page.getTop()), rightBottom = new Point2D.Double(page.getRight(), page.getBottom()), leftBottom = new Point2D.Double(page.getLeft(), page.getBottom()); page.addRuling(new Ruling(leftTop, rightTop)); page.addRuling(new Ruling(rightTop, rightBottom)); page.addRuling(new Ruling(rightBottom, leftBottom)); page.addRuling(new Ruling(leftBottom, leftTop)); } public Page getArea(float top, float left, float bottom, float right) { Rectangle area = new Rectangle(top, left, right - left, bottom - top); return getArea(area); } // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // public Integer getRotation() { return rotation; } public int getPageNumber() { return number; } /** * @deprecated with no replacement */ @Deprecated public float getMinCharWidth() { return minCharWidth; } /** * @deprecated with no replacement */ @Deprecated public float getMinCharHeight() { return minCharHeight; } // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // public List getText() { return textElements; } public List getText(Rectangle area) { return spatialIndex.contains(area); } /** * @deprecated use {@linkplain #getText(Rectangle)} instead */ @Deprecated public List getText(float top, float left, float bottom, float right) { return getText(new Rectangle(top, left, right - left, bottom - top)); } /** * @deprecated use {@linkplain #getText()} instead */ @Deprecated public List getTexts() { return textElements; } /** * Returns the minimum bounding box that contains all the TextElements on this Page */ public Rectangle getTextBounds() { List texts = this.getText(); if (!texts.isEmpty()) { return Utils.bounds(texts); } else { return new Rectangle(); } } /** * @deprecated with no replacement */ @Deprecated public boolean hasText() { return textElements.size() > 0; } // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // public List getRulings() { if (cleanRulings != null) { return cleanRulings; } if (rulings == null || rulings.isEmpty()) { verticalRulingLines = new ArrayList<>(); horizontalRulingLines = new ArrayList<>(); return new ArrayList<>(); } // TODO: Move as a static method to the Ruling class? Utils.snapPoints(rulings, minCharWidth, minCharHeight); verticalRulingLines = getCollapsedVerticalRulings(); horizontalRulingLines = getCollapsedHorizontalRulings(); cleanRulings = new ArrayList<>(verticalRulingLines); cleanRulings.addAll(horizontalRulingLines); return cleanRulings; } // TODO: Create a class for 'List ' and encapsulate these behaviors within it? private List getCollapsedVerticalRulings() { List verticalRulings = new ArrayList<>(); for (Ruling ruling : rulings) { if (ruling.vertical()) { verticalRulings.add(ruling); } } return Ruling.collapseOrientedRulings(verticalRulings); } private List getCollapsedHorizontalRulings() { List horizontalRulings = new ArrayList<>(); for (Ruling ruling : rulings) { if (ruling.horizontal()) { horizontalRulings.add(ruling); } } return Ruling.collapseOrientedRulings(horizontalRulings); } public List getVerticalRulings() { if (verticalRulingLines != null) { return verticalRulingLines; } getRulings(); return verticalRulingLines; } public List getHorizontalRulings() { if (horizontalRulingLines != null) { return horizontalRulingLines; } getRulings(); return horizontalRulingLines; } public void addRuling(Ruling ruling) { if (ruling.oblique()) { throw new UnsupportedOperationException("Can't add an oblique ruling."); } rulings.add(ruling); // Clear caches: verticalRulingLines = null; horizontalRulingLines = null; cleanRulings = null; } public List getUnprocessedRulings() { return rulings; } // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // public PDPage getPDPage() { return pdPage; } public PDDocument getPDDoc() { return pdDoc; } // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // /** * @deprecated with no replacement */ @Deprecated public RectangleSpatialIndex getSpatialIndex() { return spatialIndex; } public static class Builder { private PageDims pageDims; private int rotation; private int number; private PDPage pdPage; private PDDocument pdDocument; private List textElements; private List rulings; private float minCharWidth; private float minCharHeight; private RectangleSpatialIndex index; private Builder() {} public static Builder newInstance() { return new Builder(); } public Builder withPageDims(PageDims pageDims) { this.pageDims = pageDims; return this; } public Builder withRotation(int rotation) { this.rotation = rotation; return this; } public Builder withNumber(int number) { this.number = number; return this; } public Builder withPdPage(PDPage pdPage) { this.pdPage = pdPage; return this; } public Builder withPdDocument(PDDocument pdDocument) { this.pdDocument = pdDocument; return this; } public Builder withTextElements(List textElements) { this.textElements = textElements; return this; } public Builder withRulings(List rulings) { this.rulings = rulings; return this; } public Builder withMinCharWidth(float minCharWidth) { this.minCharWidth = minCharWidth; return this; } public Builder withMinCharHeight(float minCharHeight) { this.minCharHeight = minCharHeight; return this; } public Builder withIndex(RectangleSpatialIndex index) { this.index = index; return this; } public Page build() { return new Page(pageDims, rotation, number, pdPage, pdDocument, textElements, rulings, minCharWidth, minCharHeight, index); } } }