getSpatialIndex() {
+ return spatialIndex;
+ }
+
+ /**
+  * Accessor for the {@code minCharWidth} field.
+  *
+  * @return the value currently stored in {@code minCharWidth}
+  */
+ public float getMinCharWidth() {
+     return this.minCharWidth;
+ }
+
+ /**
+  * Accessor for the {@code minCharHeight} field.
+  *
+  * @return the value currently stored in {@code minCharHeight}
+  */
+ public float getMinCharHeight() {
+     return this.minCharHeight;
+ }
+}
diff --git a/src/main/java/technology/tabula/Utils.java b/src/main/java/technology/tabula/Utils.java
index 35c6cc4d..00814429 100644
--- a/src/main/java/technology/tabula/Utils.java
+++ b/src/main/java/technology/tabula/Utils.java
@@ -280,4 +280,9 @@ public static BufferedImage pageConvertToImage(PDPage page, int dpi, ImageType i
}
}
+ /**
+  * Renders one page of a document to an image.
+  *
+  * @param doc       document the renderer is created for; {@code page} is looked up in its page tree
+  * @param page      page to render
+  * @param dpi       resolution handed to the renderer
+  * @param imageType image type handed to the renderer
+  * @return the rendered page image
+  * @throws IOException              if rendering fails
+  * @throws IllegalArgumentException if {@code page} is not found in {@code doc}
+  */
+ public static BufferedImage pageConvertToImage(PDDocument doc, PDPage page, int dpi, ImageType imageType) throws IOException {
+     // indexOf reports -1 for a page that is not part of this document; reject it here
+     // with a clear message instead of passing an invalid index to the renderer.
+     int pageIndex = doc.getPages().indexOf(page);
+     if (pageIndex < 0) {
+         throw new IllegalArgumentException("page does not belong to the given document");
+     }
+     PDFRenderer renderer = new PDFRenderer(doc);
+     return renderer.renderImageWithDPI(pageIndex, dpi, imageType);
+ }
+
}
diff --git a/src/main/java/technology/tabula/debug/Debug.java b/src/main/java/technology/tabula/debug/Debug.java
index f9f923b5..d6d257ce 100644
--- a/src/main/java/technology/tabula/debug/Debug.java
+++ b/src/main/java/technology/tabula/debug/Debug.java
@@ -16,6 +16,7 @@
import java.util.List;
import org.apache.commons.cli.*;
+import org.apache.pdfbox.Loader;
import technology.tabula.Cell;
import technology.tabula.CommandLineApp;
import technology.tabula.Line;
@@ -215,7 +216,7 @@ public static void renderPage(String pdfPath, String outPath, int pageNumber, Re
boolean drawColumns, boolean drawCharacters, boolean drawArea, boolean drawCells,
boolean drawUnprocessedRulings, boolean drawProjectionProfile, boolean drawClippingPaths,
boolean drawDetectedTables) throws IOException {
- PDDocument document = PDDocument.load(new File(pdfPath));
+ PDDocument document = Loader.loadPDF(new File(pdfPath));
ObjectExtractor oe = new ObjectExtractor(document);
@@ -227,7 +228,7 @@ public static void renderPage(String pdfPath, String outPath, int pageNumber, Re
PDPage p = document.getPage(pageNumber);
- BufferedImage image = Utils.pageConvertToImage(p, 72, ImageType.RGB);
+ BufferedImage image = Utils.pageConvertToImage(document, p, 72, ImageType.RGB);
Graphics2D g = (Graphics2D) image.getGraphics();
@@ -349,7 +350,7 @@ public static void main(String[] args) throws IOException {
if (pages == null) {
// user specified all pages
- PDDocument document = PDDocument.load(pdfFile);
+ PDDocument document = Loader.loadPDF(pdfFile);
int numPages = document.getNumberOfPages();
pages = new ArrayList<>(numPages);
diff --git a/src/main/java/technology/tabula/detectors/NurminenDetectionAlgorithm.java b/src/main/java/technology/tabula/detectors/NurminenDetectionAlgorithm.java
index 8f155ae3..86639f66 100644
--- a/src/main/java/technology/tabula/detectors/NurminenDetectionAlgorithm.java
+++ b/src/main/java/technology/tabula/detectors/NurminenDetectionAlgorithm.java
@@ -1,22 +1,8 @@
package technology.tabula.detectors;
-import java.awt.geom.Line2D;
-import java.awt.geom.Point2D;
-import java.awt.image.BufferedImage;
-import java.awt.image.Raster;
-import java.io.IOException;
-import java.io.OutputStream;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Comparator;
-import java.util.HashMap;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-import java.util.TreeSet;
-
+import org.apache.pdfbox.contentstream.PDContentStream;
import org.apache.pdfbox.contentstream.operator.Operator;
+import org.apache.pdfbox.contentstream.operator.OperatorName;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdfparser.PDFStreamParser;
import org.apache.pdfbox.pdfwriter.ContentStreamWriter;
@@ -24,16 +10,17 @@
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDStream;
import org.apache.pdfbox.rendering.ImageType;
-
-import technology.tabula.Line;
-import technology.tabula.Page;
-import technology.tabula.Rectangle;
-import technology.tabula.Ruling;
-import technology.tabula.TextChunk;
-import technology.tabula.TextElement;
-import technology.tabula.Utils;
+import technology.tabula.*;
import technology.tabula.extractors.SpreadsheetExtractionAlgorithm;
+import java.awt.geom.Line2D;
+import java.awt.geom.Point2D;
+import java.awt.image.BufferedImage;
+import java.awt.image.Raster;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.util.*;
+
/**
* Created by matt on 2015-12-17.
*
@@ -106,7 +93,7 @@ public List detect(Page page) {
BufferedImage image;
PDPage pdfPage = page.getPDPage();
try {
- image = Utils.pageConvertToImage(pdfPage, 144, ImageType.GRAY);
+ image = Utils.pageConvertToImage(page.getPDDoc(), pdfPage, 144, ImageType.GRAY);
} catch (IOException e) {
return new ArrayList<>();
}
@@ -117,7 +104,8 @@ public List detect(Page page) {
PDDocument removeTextDocument = null;
try {
removeTextDocument = this.removeText(pdfPage);
- image = Utils.pageConvertToImage(pdfPage, 144, ImageType.GRAY);
+ pdfPage = removeTextDocument.getPage(0);
+ image = Utils.pageConvertToImage(removeTextDocument, pdfPage, 144, ImageType.GRAY);
} catch (Exception e) {
return new ArrayList<>();
} finally {
@@ -525,132 +513,103 @@ private TextEdges getTextEdges(List lines) {
Map> currMidEdges = new HashMap<>();
Map> currRightEdges = new HashMap<>();
+
+ int numOfLines = lines.size();
for (Line textRow : lines) {
for (TextChunk text : textRow.getTextElements()) {
- Integer left = new Integer((int) Math.floor(text.getLeft()));
- Integer right = new Integer((int) Math.floor(text.getRight()));
- Integer mid = new Integer(left + ((right - left) / 2));
+ Integer left = (int) Math.floor(text.getLeft());
+ Integer right = (int) Math.floor(text.getRight());
+ Integer mid = left + ((right - left) / 2);
// first put this chunk into any edge buckets it belongs to
- List leftEdge = currLeftEdges.get(left);
- if (leftEdge == null) {
- leftEdge = new ArrayList<>();
- currLeftEdges.put(left, leftEdge);
- }
+ List leftEdge = currLeftEdges.computeIfAbsent(left, k -> new ArrayList<>());
leftEdge.add(text);
- List midEdge = currMidEdges.get(mid);
- if (midEdge == null) {
- midEdge = new ArrayList<>();
- currMidEdges.put(mid, midEdge);
- }
+ List midEdge = currMidEdges.computeIfAbsent(mid, k -> new ArrayList<>());
midEdge.add(text);
- List rightEdge = currRightEdges.get(right);
- if (rightEdge == null) {
- rightEdge = new ArrayList<>();
- currRightEdges.put(right, rightEdge);
- }
+ List rightEdge = currRightEdges.computeIfAbsent(right, k -> new ArrayList<>());
rightEdge.add(text);
// now see if this text chunk blows up any other edges
- for (Iterator>> iterator = currLeftEdges.entrySet().iterator(); iterator.hasNext(); ) {
- Map.Entry> entry = iterator.next();
- Integer key = entry.getKey();
- if (key > left && key < right) {
- iterator.remove();
- List edgeChunks = entry.getValue();
- if (edgeChunks.size() >= REQUIRED_TEXT_LINES_FOR_EDGE) {
- TextChunk first = edgeChunks.get(0);
- TextChunk last = edgeChunks.get(edgeChunks.size() - 1);
-
- TextEdge edge = new TextEdge(key, first.getTop(), key, last.getBottom());
- edge.intersectingTextRowCount = Math.min(edgeChunks.size(), lines.size());
+ leftTextEdges.addAll(
+ calculateExtendedEdges(numOfLines, currLeftEdges, left, right)
+ );
- leftTextEdges.add(edge);
- }
- }
- }
-
- for (Iterator>> iterator = currMidEdges.entrySet().iterator(); iterator.hasNext(); ) {
- Map.Entry> entry = iterator.next();
- Integer key = entry.getKey();
- if (key > left && key < right && Math.abs(key - mid) > 2) {
- iterator.remove();
- List edgeChunks = entry.getValue();
- if (edgeChunks.size() >= REQUIRED_TEXT_LINES_FOR_EDGE) {
- TextChunk first = edgeChunks.get(0);
- TextChunk last = edgeChunks.get(edgeChunks.size() - 1);
-
- TextEdge edge = new TextEdge(key, first.getTop(), key, last.getBottom());
- edge.intersectingTextRowCount = Math.min(edgeChunks.size(), lines.size());
+ midTextEdges.addAll(
+ calculateExtendedEdges(numOfLines, currMidEdges, left, right, mid, 2)
+ );
- midTextEdges.add(edge);
- }
- }
- }
-
- for (Iterator>> iterator = currRightEdges.entrySet().iterator(); iterator.hasNext(); ) {
- Map.Entry> entry = iterator.next();
- Integer key = entry.getKey();
- if (key > left && key < right) {
- iterator.remove();
- List edgeChunks = entry.getValue();
- if (edgeChunks.size() >= REQUIRED_TEXT_LINES_FOR_EDGE) {
- TextChunk first = edgeChunks.get(0);
- TextChunk last = edgeChunks.get(edgeChunks.size() - 1);
-
- TextEdge edge = new TextEdge(key, first.getTop(), key, last.getBottom());
- edge.intersectingTextRowCount = Math.min(edgeChunks.size(), lines.size());
-
- rightTextEdges.add(edge);
- }
- }
- }
+ rightTextEdges.addAll(
+ calculateExtendedEdges(numOfLines, currRightEdges, left, right)
+ );
}
}
// add the leftovers
- for (Integer key : currLeftEdges.keySet()) {
- List edgeChunks = currLeftEdges.get(key);
- if (edgeChunks.size() >= REQUIRED_TEXT_LINES_FOR_EDGE) {
- TextChunk first = edgeChunks.get(0);
- TextChunk last = edgeChunks.get(edgeChunks.size() - 1);
+ leftTextEdges.addAll(
+ calculateLeftoverEdges(numOfLines, currLeftEdges)
+ );
- TextEdge edge = new TextEdge(key, first.getTop(), key, last.getBottom());
- edge.intersectingTextRowCount = Math.min(edgeChunks.size(), lines.size());
+ midTextEdges.addAll(
+ calculateLeftoverEdges(numOfLines, currMidEdges)
+ );
- leftTextEdges.add(edge);
- }
- }
+ rightTextEdges.addAll(
+ calculateLeftoverEdges(numOfLines, currRightEdges)
+ );
- for (Integer key : currMidEdges.keySet()) {
- List edgeChunks = currMidEdges.get(key);
- if (edgeChunks.size() >= REQUIRED_TEXT_LINES_FOR_EDGE) {
- TextChunk first = edgeChunks.get(0);
- TextChunk last = edgeChunks.get(edgeChunks.size() - 1);
+ return new TextEdges(leftTextEdges, midTextEdges, rightTextEdges);
+ }
- TextEdge edge = new TextEdge(key, first.getTop(), key, last.getBottom());
- edge.intersectingTextRowCount = Math.min(edgeChunks.size(), lines.size());
+ /**
+ * Builds text edges for every bucket still present in the given edge map.
+ * The map itself is only read, not modified.
+ *
+ * @param numOfLines passed through to getEdgeFromChunks for each edge that is built
+ * @param currDirectedEdges edge buckets: key mapped to the chunks collected under it
+ * @return a new set with one edge per bucket holding at least REQUIRED_TEXT_LINES_FOR_EDGE chunks
+ */
+ private Set calculateLeftoverEdges(int numOfLines, Map> currDirectedEdges) {
+ Set leftoverEdges = new HashSet<>();
+ for (Integer key : currDirectedEdges.keySet()) {
+ List edgeChunks = currDirectedEdges.get(key);
+ // buckets below the required chunk count produce no edge
+ if (edgeChunks.size() >= REQUIRED_TEXT_LINES_FOR_EDGE) {
+ TextEdge edge = getEdgeFromChunks(numOfLines, key, edgeChunks);
- midTextEdges.add(edge);
+ leftoverEdges.add(edge);
}
}
+ return leftoverEdges;
+ }
- for (Integer key : currRightEdges.keySet()) {
- List edgeChunks = currRightEdges.get(key);
- if (edgeChunks.size() >= REQUIRED_TEXT_LINES_FOR_EDGE) {
- TextChunk first = edgeChunks.get(0);
- TextChunk last = edgeChunks.get(edgeChunks.size() - 1);
+ /**
+  * Creates the edge for one bucket of chunks.
+  * The edge is constructed from {@code key}, the top of the first chunk, {@code key} again
+  * and the bottom of the last chunk.
+  *
+  * @param numOfLines upper bound applied to the edge's {@code intersectingTextRowCount}
+  * @param key        bucket key, used for both key arguments of the edge
+  * @param edgeChunks chunks of the bucket; the first and last element are read
+  * @return the new edge with {@code intersectingTextRowCount} set
+  */
+ private TextEdge getEdgeFromChunks(int numOfLines, Integer key, List edgeChunks) {
+     int chunkCount = edgeChunks.size();
+     TextChunk topChunk = edgeChunks.get(0);
+     TextChunk bottomChunk = edgeChunks.get(chunkCount - 1);
+
+     TextEdge result = new TextEdge(key, topChunk.getTop(), key, bottomChunk.getBottom());
+     // the row count is the smaller of the bucket size and numOfLines
+     result.intersectingTextRowCount = chunkCount <= numOfLines ? chunkCount : numOfLines;
+     return result;
+ }
- TextEdge edge = new TextEdge(key, first.getTop(), key, last.getBottom());
- edge.intersectingTextRowCount = Math.min(edgeChunks.size(), lines.size());
- rightTextEdges.add(edge);
+ /**
+  * Overload without a mid-distance condition: delegates to the six-argument variant
+  * with {@code mid} and {@code minDistToMid} both {@code null}.
+  *
+  * @param numOfLines        forwarded unchanged
+  * @param currDirectedEdges forwarded unchanged
+  * @param left              forwarded unchanged
+  * @param right             forwarded unchanged
+  * @return whatever the six-argument variant returns
+  */
+ private Collection calculateExtendedEdges(Integer numOfLines, Map> currDirectedEdges, Integer left, Integer right) {
+     final Integer noMid = null;
+     final Integer noMinDistToMid = null;
+     return calculateExtendedEdges(numOfLines, currDirectedEdges, left, right, noMid, noMinDistToMid);
+ }
+
+ /**
+ * Removes from the given edge map every bucket whose key lies strictly between
+ * left and right and returns edges for the removed buckets that are large enough.
+ * When both mid and minDistToMid are non-null, a bucket is only removed if its key
+ * is also more than minDistToMid away from mid.
+ *
+ * Side effect: matching entries are removed from currDirectedEdges through the iterator.
+ * Removed buckets with fewer than REQUIRED_TEXT_LINES_FOR_EDGE chunks are dropped
+ * without producing an edge.
+ *
+ * @param numOfLines passed through to getEdgeFromChunks for each edge that is built
+ * @param currDirectedEdges edge buckets: key mapped to the chunks collected under it; modified
+ * @param left exclusive lower bound for keys to remove
+ * @param right exclusive upper bound for keys to remove
+ * @param mid reference key for the distance condition, or null to skip that condition
+ * @param minDistToMid distance a key must exceed from mid, or null to skip that condition
+ * @return a new set with the edges built from the removed buckets
+ */
+ private Collection calculateExtendedEdges(Integer numOfLines, Map> currDirectedEdges, Integer left, Integer right, Integer mid, Integer minDistToMid) {
+ Set extendedEdges = new HashSet<>();
+ Iterator>> edgeIterator = currDirectedEdges.entrySet().iterator();
+ while (edgeIterator.hasNext()) {
+ Map.Entry> entry = edgeIterator.next();
+ Integer key = entry.getKey();
+
+ // the distance condition only applies when both mid and minDistToMid are given:
+ // the key must then be more than minDistToMid away from mid; otherwise it always passes
+ boolean hasMinDistToMid = mid == null || minDistToMid == null || Math.abs(key - mid) > minDistToMid;
+
+ if (key > left && key < right && hasMinDistToMid) {
+ // remove through the iterator so the map can be changed while it is being traversed
+ edgeIterator.remove();
+ List edgeChunks = entry.getValue();
+ if (edgeChunks.size() >= REQUIRED_TEXT_LINES_FOR_EDGE) {
+ TextEdge edge = getEdgeFromChunks(numOfLines, key, edgeChunks);
+ extendedEdges.add(edge);
+ }
+ }
}
-
- return new TextEdges(leftTextEdges, midTextEdges, rightTextEdges);
+ return extendedEdges;
}
private List getTableAreasFromCells(List extends Rectangle> cells) {
@@ -664,9 +623,9 @@ private List getTableAreasFromCells(List extends Rectangle> cells)
Point2D[] groupCellCorners = groupCell.getPoints();
Point2D[] candidateCorners = cell.getPoints();
- for (int i = 0; i < candidateCorners.length; i++) {
- for (int j = 0; j < groupCellCorners.length; j++) {
- if (candidateCorners[i].distance(groupCellCorners[j]) < CELL_CORNER_DISTANCE_MAXIMUM) {
+ for (Point2D candidateCorner : candidateCorners) {
+ for (Point2D groupCellCorner : groupCellCorners) {
+ if (candidateCorner.distance(groupCellCorner) < CELL_CORNER_DISTANCE_MAXIMUM) {
cellGroup.add(cell);
addedToGroup = true;
break cellCheck;
@@ -835,37 +794,63 @@ private List getVerticalRulings(BufferedImage image) {
return verticalRulings;
}
-
- // taken from http://www.docjar.com/html/api/org/apache/pdfbox/examples/util/RemoveAllText.java.html
private PDDocument removeText(PDPage page) throws IOException {
PDFStreamParser parser = new PDFStreamParser(page);
parser.parse();
- List