table = sea.extract(page);
+ // iterate over the tables of the page
+ for(Table tables: table) {
+ List> rows = tables.getRows();
+ // iterate over the rows of the table
+ for (List cells : rows) {
+ // print all column-cells of the row plus linefeed
+ for (RectangularTextContainer content : cells) {
+ // Note: Cell.getText() uses \r to concat text chunks
+ String text = content.getText().replace("\r", " ");
+ System.out.print(text + "|");
+ }
+ System.out.println();
+ }
+ }
+ }
+}
+```
+
+
+For more detailed information, check the Javadoc.
+The Javadoc API documentation can be generated (see also the '_Building from Source_' section) via
+
+```
+mvn javadoc:javadoc
+```
+
+which generates the HTML files in the directory ```target/site/apidocs/```
+
## Building from Source
Clone this repo and run:
@@ -96,7 +143,7 @@ You can help by:
### Backers
-You can also support our continued work on `tabula-java` with a one-time or monthly donation [on OpenCollective](https://opencollective.com/tabulapdf#support). Organizations who use `tabula-java` can also [sponsor the project](https://opencollective.com/tabulapdf#support) for acknolwedgement on [our official site](http://tabula.technology/) and this README.
+You can also support our continued work on `tabula-java` with a one-time or monthly donation [on OpenCollective](https://opencollective.com/tabulapdf#support). Organizations who use `tabula-java` can also [sponsor the project](https://opencollective.com/tabulapdf#support) for acknowledgement on [our official site](http://tabula.technology/) and this README.
Special thanks to the following users and organizations for generously supporting Tabula with donations and grants:
@@ -107,5 +154,5 @@ Special thanks to the following users and organizations for generously supportin
-
+
diff --git a/appveyor.yml b/appveyor.yml
deleted file mode 100644
index f60e8fd5..00000000
--- a/appveyor.yml
+++ /dev/null
@@ -1,19 +0,0 @@
-version: '{build}'
-install:
- - ps: |
- Add-Type -AssemblyName System.IO.Compression.FileSystem
- if (!(Test-Path -Path "C:\maven" )) {
- (new-object System.Net.WebClient).DownloadFile(
- 'http://www.us.apache.org/dist/maven/maven-3/3.5.0/binaries/apache-maven-3.5.0-bin.zip',
- 'C:\maven-bin.zip'
- )
- [System.IO.Compression.ZipFile]::ExtractToDirectory("C:\maven-bin.zip", "C:\maven")
- }
- - cmd: SET PATH=C:\maven\apache-maven-3.2.5\bin;%JAVA_HOME%\bin;%PATH%
- - cmd: SET MAVEN_OPTS=-XX:MaxPermSize=2g -Xmx4g
- - cmd: SET JAVA_OPTS=-XX:MaxPermSize=2g -Xmx4g
-test_script:
- - mvn clean install --batch-mode
-cache:
- - C:\maven\
- - C:\Users\appveyor\.m2
diff --git a/jbang-catalog.json b/jbang-catalog.json
new file mode 100644
index 00000000..b7f71347
--- /dev/null
+++ b/jbang-catalog.json
@@ -0,0 +1,8 @@
+{
+ "catalogs": {},
+ "aliases": {
+ "tabula": {
+ "script-ref": "https://github.com/tabulapdf/tabula-java/releases/download/v1.0.4/tabula-1.0.4-jar-with-dependencies.jar"
+ }
+ }
+}
\ No newline at end of file
diff --git a/pom.xml b/pom.xml
index 2145aa06..211d0d4d 100644
--- a/pom.xml
+++ b/pom.xml
@@ -1,8 +1,9 @@
-
+4.0.0technology.tabulatabula
- 1.0.2-SNAPSHOT
+ 1.0.6-SNAPSHOTTabulaExtract tables from PDF fileshttp://github.com/tabulapdf/tabula-java
@@ -32,21 +33,26 @@
+
+
+ snapshots
+ https://repository.apache.org/content/repositories/snapshots/
+
+ false
+
+
+ true
+
+
+
+
scm:git:git@github.com:tabulapdf/tabula-java.gitscm:git:git@github.com:tabulapdf/tabula-java.gitgit@github.com:tabulapdf/tabula-java.git
- tabula-1.0.0-SNAPSHOT
+ v1.0.2
-
-
- sonatype
- Sonatype repository
- https://oss.sonatype.org/content/repositories/snapshots/
-
-
-
UTF-8UTF-8
@@ -68,7 +74,7 @@
org.apache.maven.pluginsmaven-javadoc-plugin
- 2.10.3
+ 3.8.0true
@@ -81,7 +87,7 @@
org.sonatype.pluginsnexus-staging-maven-plugin
- 1.6.3
+ 1.7.0trueossrh
@@ -93,7 +99,7 @@
org.apache.maven.pluginsmaven-source-plugin
- 2.2.1
+ 3.3.1attach-sources
@@ -103,11 +109,13 @@
-
org.apache.maven.pluginsmaven-javadoc-plugin
- 2.9.1
+ 3.8.0
+
+ 8
+ attach-javadocs
@@ -120,7 +128,7 @@
org.apache.maven.pluginsmaven-gpg-plugin
- 1.5
+ 3.2.4sign-artifacts
@@ -128,15 +136,21 @@
sign
+
+
+ --pinentry-mode
+ loopback
+
+ maven-compiler-plugin
- 3.1
+ 3.13.0
- 1.7
- 1.7
+ 1.8
+ 1.8
@@ -146,152 +160,166 @@
technology.tabula.CommandLineApp
-
-
- jar-with-dependencies
-
+
+
+ jar-with-dependencies
+
-
-
- org.apache.maven.plugins
- maven-surefire-plugin
+
+
+ org.apache.maven.plugins
+ maven-surefire-plugin
+ 3.3.1-Xms1024m -Xmx2048m
-
-
-
-
-
-
- release
-
-
+
+
org.apache.maven.plugins
- maven-javadoc-plugin
- 2.9.1
-
-
- attach-javadocs
-
- jar
-
-
-
+ maven-eclipse-plugin
+ 2.10
+
+ true
+ true
+
-
- org.apache.maven.plugins
- maven-source-plugin
- 2.2.1
-
-
- attach-sources
-
- jar-no-fork
-
-
-
-
-
- org.apache.maven.plugins
- maven-gpg-plugin
- 1.5
-
-
- sign-artifacts
- verify
-
- sign
-
-
-
-
-
-
-
-
+
+
+
+
+
+ release
+
+
+
+ org.apache.maven.plugins
+ maven-javadoc-plugin
+ 3.8.0
+
+ 8
+
+
+
+ attach-javadocs
+
+ jar
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-source-plugin
+ 3.3.1
+
+
+ attach-sources
+
+ jar-no-fork
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-gpg-plugin
+ 3.2.4
+
+
+ sign-artifacts
+ verify
+
+ sign
+
+
+
+
+
+
+
+
-
-
- net.sf.jsi
- jsi
- 1.1.0-SNAPSHOT
-
+
+
+ org.locationtech.jts
+ jts-core
+ 1.20.0
+
-
- org.slf4j
- slf4j-api
- 1.7.25
-
+
+ org.slf4j
+ slf4j-api
+ 2.0.13
+
-
- org.slf4j
- slf4j-simple
- 1.7.25
-
+
+ org.slf4j
+ slf4j-simple
+ 2.0.13
+
-
- org.apache.pdfbox
- pdfbox
- 2.0.7
-
+
+ org.apache.pdfbox
+ pdfbox
+ 3.0.4
+
-
- org.bouncycastle
- bcprov-jdk15on
- 1.56
-
+
+ org.bouncycastle
+ bcprov-jdk18on
+ 1.80
+
-
- org.bouncycastle
- bcmail-jdk15on
- 1.56
-
+
+ org.bouncycastle
+ bcmail-jdk18on
+ 1.80
+
-
- junit
- junit
- 4.11
- test
-
+
+ junit
+ junit
+ 4.13.2
+ test
+
-
- commons-cli
- commons-cli
- 1.4
-
+
+ commons-cli
+ commons-cli
+ 1.8.0
+
-
- org.apache.commons
- commons-csv
- 1.4
-
+
+ org.apache.commons
+ commons-csv
+ 1.11.0
+
-
- com.google.code.gson
- gson
- 2.8.0
-
+
+ com.google.code.gson
+ gson
+ 2.11.0
+
-
- com.github.jai-imageio
- jai-imageio-core
- 1.3.1
-
+
+ com.github.jai-imageio
+ jai-imageio-core
+ 1.4.0
+
-
- com.github.jai-imageio
- jai-imageio-jpeg2000
- 1.3.0
-
+
+ com.github.jai-imageio
+ jai-imageio-jpeg2000
+ 1.4.0
+
-
- com.levigo.jbig2
- levigo-jbig2-imageio
- 2.0
-
-
+
+ org.apache.pdfbox
+ jbig2-imageio
+ 3.0.4
+
+
diff --git a/src/main/java/technology/tabula/Cell.java b/src/main/java/technology/tabula/Cell.java
index b7e568db..d02c8c50 100644
--- a/src/main/java/technology/tabula/Cell.java
+++ b/src/main/java/technology/tabula/Cell.java
@@ -1,75 +1,62 @@
package technology.tabula;
import java.awt.geom.Point2D;
-import java.util.ArrayList;
import java.util.Collections;
-import java.util.List;
@SuppressWarnings("serial")
public class Cell extends RectangularTextContainer {
- private boolean spanning;
- private boolean placeholder;
- private List textElements;
-
- public Cell(float top, float left, float width, float height) {
- super(top, left, width, height);
- this.setPlaceholder(false);
- this.setSpanning(false);
- this.setTextElements(new ArrayList());
- }
-
- public Cell(Point2D topLeft, Point2D bottomRight) {
- super((float) topLeft.getY(), (float) topLeft.getX(), (float) (bottomRight.getX() - topLeft.getX()), (float) (bottomRight.getY() - topLeft.getY()));
- this.setPlaceholder(false);
- this.setSpanning(false);
- this.setTextElements(new ArrayList());
- }
-
- @Override
- public String getText(boolean useLineReturns) {
- if (this.textElements.size() == 0) {
- return "";
- }
- StringBuilder sb = new StringBuilder();
- Collections.sort(this.textElements);
- double curTop = this.textElements.get(0).getTop();
- for (TextChunk tc: this.textElements) {
- if (useLineReturns && tc.getTop() > curTop) {
- sb.append('\r');
- }
- sb.append(tc.getText());
- curTop = tc.getTop();
- }
- return sb.toString().trim();
- }
-
- public String getText() {
- return getText(true);
- }
-
- public boolean isSpanning() {
- return spanning;
- }
-
- public void setSpanning(boolean spanning) {
- this.spanning = spanning;
- }
-
- public boolean isPlaceholder() {
- return placeholder;
- }
-
- public void setPlaceholder(boolean placeholder) {
- this.placeholder = placeholder;
- }
-
-
- public List getTextElements() {
- return textElements;
- }
-
- public void setTextElements(List textElements) {
- this.textElements = textElements;
- }
+ public Cell(float top, float left, float width, float height) {
+ super(top, left, width, height);
+ this.setPlaceholder(false);
+ this.setSpanning(false);
+ }
+
+ public Cell(Point2D topLeft, Point2D bottomRight) {
+ super((float) topLeft.getY(), (float) topLeft.getX(), (float) (bottomRight.getX() - topLeft.getX()), (float) (bottomRight.getY() - topLeft.getY()));
+ this.setPlaceholder(false);
+ this.setSpanning(false);
+ }
+
+ private boolean spanning;
+ private boolean placeholder;
+
+ @Override
+ public String getText(boolean useLineReturns) {
+ if (this.textElements.size() == 0) {
+ return "";
+ }
+ StringBuilder sb = new StringBuilder();
+ this.textElements.sort(Rectangle.ILL_DEFINED_ORDER);
+ double curTop = this.textElements.get(0).getTop();
+ for (TextChunk tc : this.textElements) {
+ if (useLineReturns && tc.getTop() > curTop) {
+ sb.append('\r');
+ }
+ sb.append(tc.getText());
+ curTop = tc.getTop();
+ }
+ return sb.toString().trim();
+ }
+
+ @Override
+ public String getText() {
+ return getText(true);
+ }
+
+ public boolean isSpanning() {
+ return spanning;
+ }
+
+ public void setSpanning(boolean spanning) {
+ this.spanning = spanning;
+ }
+
+ public boolean isPlaceholder() {
+ return placeholder;
+ }
+
+ public void setPlaceholder(boolean placeholder) {
+ this.placeholder = placeholder;
+ }
}
diff --git a/src/main/java/technology/tabula/CohenSutherlandClipping.java b/src/main/java/technology/tabula/CohenSutherlandClipping.java
index 5e170ad8..db9153e9 100644
--- a/src/main/java/technology/tabula/CohenSutherlandClipping.java
+++ b/src/main/java/technology/tabula/CohenSutherlandClipping.java
@@ -18,122 +18,124 @@
* Implements the well known Cohen Sutherland line
* clipping algorithm (line against clip rectangle).
*/
-public final class CohenSutherlandClipping
-{
+public final class CohenSutherlandClipping {
+
private double xMin;
private double yMin;
private double xMax;
private double yMax;
+ private static final int INSIDE = 0;
+ private static final int LEFT = 1;
+ private static final int RIGHT = 2;
+ private static final int BOTTOM = 4;
+ private static final int TOP = 8;
+
+ private final static float MINIMUM_DELTA = 0.01f;
+
/**
- * Creates a Cohen Sutherland clipper with clip rect (0, 0, 0, 0).
+ * Creates a Cohen Sutherland clipper with clip window (0, 0, 0, 0).
*/
- public CohenSutherlandClipping() {
- }
+ public CohenSutherlandClipping() {}
/**
- * Creates a Cohen Sutherland clipper with the given clip rectangle.
- * @param clip the clip rectangle to use
+ * Creates a Cohen Sutherland clipper with the given clip window.
+ * @param clipWindow the clip window to use.
*/
- public CohenSutherlandClipping(Rectangle2D clip) {
- setClip(clip);
+ public CohenSutherlandClipping(Rectangle2D clipWindow) {
+ setClip(clipWindow);
}
/**
* Sets the clip rectangle.
- * @param clip the clip rectangle
+ * @param clipWindow the clip window.
*/
- public void setClip(Rectangle2D clip) {
- xMin = clip.getX();
- xMax = xMin + clip.getWidth();
- yMin = clip.getY();
- yMax = yMin + clip.getHeight();
- }
-
- private static final int INSIDE = 0;
- private static final int LEFT = 1;
- private static final int RIGHT = 2;
- private static final int BOTTOM = 4;
- private static final int TOP = 8;
-
- private final int regionCode(double x, double y) {
- int code = x < xMin
- ? LEFT
- : x > xMax
- ? RIGHT
- : INSIDE;
- if (y < yMin) code |= BOTTOM;
- else if (y > yMax) code |= TOP;
- return code;
+ public void setClip(Rectangle2D clipWindow) {
+ xMin = clipWindow.getX();
+ xMax = xMin + clipWindow.getWidth();
+ yMin = clipWindow.getY();
+ yMax = yMin + clipWindow.getHeight();
}
+ // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - //
/**
- * Clips a given line against the clip rectangle.
+ * Clips a given line against the clip window.
* The modification (if needed) is done in place.
- * @param line the line to clip
+ * @param line the line to clip.
* @return true if line is clipped, false if line is
- * totally outside the clip rect.
+ * totally outside the clip window.
*/
public boolean clip(Line2D.Float line) {
+ Point point1 = new Point(line.getX1(), line.getY1());
+ Point point2 = new Point(line.getX2(), line.getY2());
+ Point outsidePoint = new Point(0d, 0d);
- double p1x = line.getX1();
- double p1y = line.getY1();
- double p2x = line.getX2();
- double p2y = line.getY2();
+ boolean lineIsVertical = (point1.x == point2.x);
+ double lineSlope = lineIsVertical ? 0d : (point2.y-point1.y)/(point2.x-point1.x);
- double qx = 0d;
- double qy = 0d;
+ while (point1.region != INSIDE || point2.region != INSIDE) {
+ if ((point1.region & point2.region) != 0) return false;
- boolean vertical = p1x == p2x;
+ outsidePoint.region = (point1.region == INSIDE) ? point2.region : point1.region;
- double slope = vertical
- ? 0d
- : (p2y-p1y)/(p2x-p1x);
-
- int c1 = regionCode(p1x, p1y);
- int c2 = regionCode(p2x, p2y);
-
- while (c1 != INSIDE || c2 != INSIDE) {
-
- if ((c1 & c2) != INSIDE)
- return false;
-
- int c = c1 == INSIDE ? c2 : c1;
-
- if ((c & LEFT) != INSIDE) {
- qx = xMin;
- qy = (Utils.feq(qx, p1x) ? 0 : qx-p1x)*slope + p1y;
+ if ((outsidePoint.region & LEFT) != 0) {
+ outsidePoint.x = xMin;
+ outsidePoint.y = delta(outsidePoint.x, point1.x)*lineSlope + point1.y;
}
- else if ((c & RIGHT) != INSIDE) {
- qx = xMax;
- qy = (Utils.feq(qx, p1x) ? 0 : qx-p1x)*slope + p1y;
+ else if ((outsidePoint.region & RIGHT) != 0) {
+ outsidePoint.x = xMax;
+ outsidePoint.y = delta(outsidePoint.x, point1.x)*lineSlope + point1.y;
}
- else if ((c & BOTTOM) != INSIDE) {
- qy = yMin;
- qx = vertical
- ? p1x
- : (Utils.feq(qy, p1y) ? 0 : qy-p1y)/slope + p1x;
+ else if ((outsidePoint.region & BOTTOM) != 0) {
+ outsidePoint.y = yMin;
+ outsidePoint.x = lineIsVertical
+ ? point1.x
+ : delta(outsidePoint.y, point1.y)/lineSlope + point1.x;
}
- else if ((c & TOP) != INSIDE) {
- qy = yMax;
- qx = vertical
- ? p1x
- : (Utils.feq(qy, p1y) ? 0 : qy-p1y)/slope + p1x;
+ else if ((outsidePoint.region & TOP) != 0) {
+ outsidePoint.y = yMax;
+ outsidePoint.x = lineIsVertical
+ ? point1.x
+ : delta(outsidePoint.y, point1.y)/lineSlope + point1.x;
}
- if (c == c1) {
- p1x = qx;
- p1y = qy;
- c1 = regionCode(p1x, p1y);
+ if (outsidePoint.isInTheSameRegionAs(point1)) {
+ point1.setPositionAndRegion(outsidePoint.x, outsidePoint.y);
}
else {
- p2x = qx;
- p2y = qy;
- c2 = regionCode(p2x, p2y);
+ point2.setPositionAndRegion(outsidePoint.x, outsidePoint.y);
}
}
- line.setLine(p1x, p1y, p2x, p2y);
+ line.setLine(point1.x, point1.y, point2.x, point2.y);
return true;
}
+
+ // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - //
+ private static double delta(double value1, double value2) {
+ return (Math.abs(value1 - value2) < MINIMUM_DELTA) ? 0 : (value1 - value2);
+ }
+
+ // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - //
+ class Point {
+ double x, y;
+ int region;
+
+ Point(double x, double y) {
+ setPositionAndRegion(x, y);
+ }
+
+ void setPositionAndRegion(double x, double y) {
+ this.x = x; this.y = y;
+ region = (x < xMin) ? LEFT : (x > xMax) ? RIGHT : INSIDE;
+ if (y < yMin)
+ region |= BOTTOM;
+ else if (y > yMax)
+ region |= TOP;
+ }
+
+ boolean isInTheSameRegionAs(Point otherPoint) {
+ return this.region == otherPoint.region;
+ }
+ }
+
}
-// end of file
\ No newline at end of file
diff --git a/src/main/java/technology/tabula/CommandLineApp.java b/src/main/java/technology/tabula/CommandLineApp.java
index 21df07b5..1b422303 100644
--- a/src/main/java/technology/tabula/CommandLineApp.java
+++ b/src/main/java/technology/tabula/CommandLineApp.java
@@ -15,6 +15,7 @@
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.DefaultParser;
+import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import technology.tabula.detectors.DetectionAlgorithm;
@@ -29,12 +30,17 @@
public class CommandLineApp {
- private static String VERSION = "1.0.2";
- private static String VERSION_STRING = String.format("tabula %s (c) 2012-2017 Manuel Aristarán", VERSION);
+ private static String VERSION = "1.0.6-SNAPSHOT";
+ private static String VERSION_STRING = String.format("tabula %s (c) 2012-2020 Manuel Aristarán", VERSION);
private static String BANNER = "\nTabula helps you extract tables from PDFs\n\n";
+ private static final int RELATIVE_AREA_CALCULATION_MODE = 0;
+ private static final int ABSOLUTE_AREA_CALCULATION_MODE = 1;
+
+
private Appendable defaultOutput;
- private Rectangle pageArea;
+
+ private List> pageAreas;
private List pages;
private OutputFormat outputFormat;
private String password;
@@ -42,7 +48,7 @@ public class CommandLineApp {
public CommandLineApp(Appendable defaultOutput, CommandLine line) throws ParseException {
this.defaultOutput = defaultOutput;
- this.pageArea = CommandLineApp.whichArea(line);
+ this.pageAreas = CommandLineApp.whichAreas(line);
this.pages = CommandLineApp.whichPages(line);
this.outputFormat = CommandLineApp.whichOutputFormat(line);
this.tableExtractor = CommandLineApp.createExtractor(line);
@@ -109,13 +115,17 @@ public boolean accept(File dir, String name) {
});
for (File pdfFile : pdfs) {
- File outputFile = new File(getOutputFilename(pdfFile));
+ File outputFile = new File(getOutputFilename(pdfFile));
+ try {
extractFileInto(pdfFile, outputFile);
+ } catch (ParseException e) {
+ System.err.println("Caught exception while processing file: " + pdfFile.toString());
+ throw e;
+ }
}
}
public void extractFileTables(CommandLine line, File pdfFile) throws ParseException {
- Appendable outFile = this.defaultOutput;
if (!line.hasOption('o')) {
extractFile(pdfFile, this.defaultOutput);
return;
@@ -149,18 +159,32 @@ public void extractFileInto(File pdfFile, File outputFile) throws ParseException
private void extractFile(File pdfFile, Appendable outFile) throws ParseException {
PDDocument pdfDocument = null;
try {
- pdfDocument = this.password == null ? PDDocument.load(pdfFile) : PDDocument.load(pdfFile, this.password);
+ pdfDocument = this.password == null ? Loader.loadPDF(pdfFile) : Loader.loadPDF(pdfFile,password);
PageIterator pageIterator = getPageIterator(pdfDocument);
- List
tables = new ArrayList
();
+ List
tables = new ArrayList<>();
while (pageIterator.hasNext()) {
Page page = pageIterator.next();
- if (pageArea != null) {
- page = page.getArea(pageArea);
+ if (tableExtractor.verticalRulingPositions != null) {
+ for (Float verticalRulingPosition : tableExtractor.verticalRulingPositions) {
+ page.addRuling(new Ruling(0, verticalRulingPosition, 0.0f, (float) page.getHeight()));
+ }
}
- tables.addAll(tableExtractor.extractTables(page));
+ if (pageAreas != null) {
+ for (Pair areaPair : pageAreas) {
+ Rectangle area = areaPair.getRight();
+ if (areaPair.getLeft() == RELATIVE_AREA_CALCULATION_MODE) {
+ area = new Rectangle((float) (area.getTop() / 100 * page.getHeight()),
+ (float) (area.getLeft() / 100 * page.getWidth()), (float) (area.getWidth() / 100 * page.getWidth()),
+ (float) (area.getHeight() / 100 * page.getHeight()));
+ }
+ tables.addAll(tableExtractor.extractTables(page.getArea(area)));
+ }
+ } else {
+ tables.addAll(tableExtractor.extractTables(page));
+ }
}
writeTables(tables, outFile);
} catch (IOException e) {
@@ -200,16 +224,28 @@ private static OutputFormat whichOutputFormat(CommandLine line) throws ParseExce
}
}
- private static Rectangle whichArea(CommandLine line) throws ParseException {
+ private static List> whichAreas(CommandLine line) throws ParseException {
if (!line.hasOption('a')) {
return null;
}
- List f = parseFloatList(line.getOptionValue('a'));
- if (f.size() != 4) {
- throw new ParseException("area parameters must be top,left,bottom,right");
+ String[] optionValues = line.getOptionValues('a');
+
+ List> areaList = new ArrayList>();
+ for (String optionValue : optionValues) {
+ int areaCalculationMode = ABSOLUTE_AREA_CALCULATION_MODE;
+ int startIndex = 0;
+ if (optionValue.startsWith("%")) {
+ startIndex = 1;
+ areaCalculationMode = RELATIVE_AREA_CALCULATION_MODE;
+ }
+ List f = parseFloatList(optionValue.substring(startIndex));
+ if (f.size() != 4) {
+ throw new ParseException("area parameters must be top,left,bottom,right optionally preceded by %");
+ }
+ areaList.add(new Pair(areaCalculationMode, new Rectangle(f.get(0), f.get(1), f.get(3) - f.get(1), f.get(2) - f.get(0))));
}
- return new Rectangle(f.get(0), f.get(1), f.get(3) - f.get(1), f.get(2) - f.get(0));
+ return areaList;
}
private static List whichPages(CommandLine line) throws ParseException {
@@ -224,7 +260,7 @@ private static ExtractionMethod whichExtractionMethod(CommandLine line) {
}
// -n/--no-spreadsheet [deprecated; use -t] or -c/--columns or -g/--guess or -t/--stream
- if (line.hasOption('n') || line.hasOption('c') || line.hasOption('g') || line.hasOption('t')) {
+ if (line.hasOption('n') || line.hasOption('c') || line.hasOption('t')) {
return ExtractionMethod.BASIC;
}
return ExtractionMethod.DECIDE;
@@ -237,8 +273,14 @@ private static TableExtractor createExtractor(CommandLine line) throws ParseExce
extractor.setUseLineReturns(line.hasOption('u'));
if (line.hasOption('c')) {
- extractor.setVerticalRulingPositions(parseFloatList(line.getOptionValue('c')));
+ String optionString = line.getOptionValue('c');
+ if (optionString.startsWith("%")) {
+ extractor.setVerticalRulingPositionsRelative(true);
+ optionString = optionString.substring(1);
+ }
+ extractor.setVerticalRulingPositions(parseFloatList(optionString));
}
+
return extractor;
}
@@ -246,10 +288,10 @@ private static TableExtractor createExtractor(CommandLine line) throws ParseExce
public static List parseFloatList(String option) throws ParseException {
String[] f = option.split(",");
- List rv = new ArrayList();
+ List rv = new ArrayList<>();
try {
- for (int i = 0; i < f.length; i++) {
- rv.add(Float.parseFloat(f[i]));
+ for (final String element : f) {
+ rv.add(Float.parseFloat(element));
}
return rv;
} catch (NumberFormatException e) {
@@ -262,7 +304,6 @@ private static void printHelp() {
formatter.printHelp("tabula", BANNER, buildOptions(), "", true);
}
- @SuppressWarnings("static-access")
public static Options buildOptions() {
Options o = new Options();
@@ -275,7 +316,7 @@ public static Options buildOptions() {
o.addOption("t", "stream", false, "Force PDF to be extracted using stream-mode extraction (if there are no ruling lines separating each cell)");
o.addOption("i", "silent", false, "Suppress all stderr output.");
o.addOption("u", "use-line-returns", false, "Use embedded line returns in cells. (Only in spreadsheet mode.)");
- o.addOption("d", "debug", false, "Print detected table areas instead of processing.");
+ // o.addOption("d", "debug", false, "Print detected table areas instead of processing.");
o.addOption(Option.builder("b")
.longOpt("batch")
.desc("Convert all .pdfs in the provided directory.")
@@ -302,13 +343,18 @@ public static Options buildOptions() {
.build());
o.addOption(Option.builder("c")
.longOpt("columns")
- .desc("X coordinates of column boundaries. Example --columns 10.1,20.2,30.3")
+ .desc("X coordinates of column boundaries. Example --columns 10.1,20.2,30.3. "
+ + "If all values are between 0-100 (inclusive) and preceded by '%', input will be taken as % of actual width of the page. "
+ + "Example: --columns %25,50,80.6")
.hasArg()
.argName("COLUMNS")
.build());
o.addOption(Option.builder("a")
.longOpt("area")
- .desc("Portion of the page to analyze (top,left,bottom,right). Example: --area 269.875,12.75,790.5,561. Default is entire page")
+ .desc("-a/--area = Portion of the page to analyze. Example: --area 269.875,12.75,790.5,561. "
+ + "Accepts top,left,bottom,right i.e. y1,x1,y2,x2 where all values are in points relative to the top left corner. "
+ + "If all values are between 0-100 (inclusive) and preceded by '%', input will be taken as % of actual height or width of the page. "
+ + "Example: --area %0,0,100,50. To specify multiple areas, -a option should be repeated. Default is entire page")
.hasArg()
.argName("AREA")
.build());
@@ -327,7 +373,10 @@ private static class TableExtractor {
private boolean useLineReturns = false;
private BasicExtractionAlgorithm basicExtractor = new BasicExtractionAlgorithm();
private SpreadsheetExtractionAlgorithm spreadsheetExtractor = new SpreadsheetExtractionAlgorithm();
+
+ private boolean verticalRulingPositionsRelative = false;
private List verticalRulingPositions = null;
+
private ExtractionMethod method = ExtractionMethod.BASIC;
public TableExtractor() {
@@ -337,6 +386,10 @@ public void setVerticalRulingPositions(List positions) {
this.verticalRulingPositions = positions;
}
+ public void setVerticalRulingPositionsRelative(boolean relative) {
+ this.verticalRulingPositionsRelative = relative;
+ }
+
public void setGuess(boolean guess) {
this.guess = guess;
}
@@ -362,7 +415,7 @@ public List
extractTables(Page page) {
case SPREADSHEET:
return extractTablesSpreadsheet(page);
default:
- return new ArrayList
();
+ return new ArrayList<>();
}
}
@@ -372,7 +425,7 @@ public List
extractTablesBasic(Page page) {
// currently we only have a detector that uses spreadsheets to find table areas
DetectionAlgorithm detector = new NurminenDetectionAlgorithm();
List guesses = detector.detect(page);
- List
tables = new ArrayList
();
+ List
tables = new ArrayList<>();
for (Rectangle guessRect : guesses) {
Page guess = page.getArea(guessRect);
@@ -382,14 +435,27 @@ public List
extractTablesBasic(Page page) {
}
if (verticalRulingPositions != null) {
- return basicExtractor.extract(page, verticalRulingPositions);
+ List absoluteRulingPositions;
+
+ if (this.verticalRulingPositionsRelative) {
+ // convert relative to absolute
+ absoluteRulingPositions = new ArrayList<>(verticalRulingPositions.size());
+ for (float relative : this.verticalRulingPositions) {
+ float absolute = (float) (relative / 100.0 * page.getWidth());
+ absoluteRulingPositions.add(absolute);
+ }
+ } else {
+ absoluteRulingPositions = this.verticalRulingPositions;
+ }
+ return basicExtractor.extract(page, absoluteRulingPositions);
}
+
return basicExtractor.extract(page);
}
public List
extractTablesSpreadsheet(Page page) {
// TODO add useLineReturns
- return (List
) spreadsheetExtractor.extract(page);
+ return spreadsheetExtractor.extract(page);
}
}
diff --git a/src/main/java/technology/tabula/HasText.java b/src/main/java/technology/tabula/HasText.java
index 6f375dbc..1a9bda99 100644
--- a/src/main/java/technology/tabula/HasText.java
+++ b/src/main/java/technology/tabula/HasText.java
@@ -1,7 +1,8 @@
package technology.tabula;
public interface HasText {
-
- String getText();
+
+ String getText();
+ String getText(boolean useLineReturns);
}
diff --git a/src/main/java/technology/tabula/Line.java b/src/main/java/technology/tabula/Line.java
index ed2f6895..31d10529 100644
--- a/src/main/java/technology/tabula/Line.java
+++ b/src/main/java/technology/tabula/Line.java
@@ -8,7 +8,7 @@
@SuppressWarnings("serial")
public class Line extends Rectangle {
- List textChunks = new ArrayList();
+ List textChunks = new ArrayList<>();
public static final Character[] WHITE_SPACE_CHARS = { ' ', '\t', '\r', '\n', '\f' };
@@ -52,7 +52,7 @@ public void addTextChunk(TextChunk textChunk) {
public String toString() {
StringBuilder sb = new StringBuilder();
String s = super.toString();
- sb.append(s.substring(0, s.length() - 1));
+ sb.append(s, 0, s.length() - 1);
sb.append(",chunks=");
for (TextChunk te: this.textChunks) {
sb.append("'" + te.getText() + "', ");
diff --git a/src/main/java/technology/tabula/ObjectExtractor.java b/src/main/java/technology/tabula/ObjectExtractor.java
index 2b97a5a8..9f3f6a03 100644
--- a/src/main/java/technology/tabula/ObjectExtractor.java
+++ b/src/main/java/technology/tabula/ObjectExtractor.java
@@ -5,63 +5,69 @@
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
-public class ObjectExtractor {
+public class ObjectExtractor implements java.io.Closeable {
private final PDDocument pdfDocument;
- public ObjectExtractor(PDDocument pdfDocument) throws IOException {
+ public ObjectExtractor(PDDocument pdfDocument) {
this.pdfDocument = pdfDocument;
}
+ // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - //
protected Page extractPage(Integer pageNumber) throws IOException {
-
- if (pageNumber > this.pdfDocument.getNumberOfPages() || pageNumber < 1) {
- throw new java.lang.IndexOutOfBoundsException(
- "Page number does not exist");
+ if (pageNumber > pdfDocument.getNumberOfPages() || pageNumber < 1) {
+ throw new java.lang.IndexOutOfBoundsException("Page number does not exist.");
}
+ PDPage page = pdfDocument.getPage(pageNumber - 1);
- PDPage p = this.pdfDocument.getPage(pageNumber - 1);
-
- ObjectExtractorStreamEngine se = new ObjectExtractorStreamEngine(p);
- se.processPage(p);
-
-
- TextStripper pdfTextStripper = new TextStripper(this.pdfDocument, pageNumber);
+ ObjectExtractorStreamEngine streamEngine = new ObjectExtractorStreamEngine(page);
+ streamEngine.processPage(page);
- pdfTextStripper.process();
+ TextStripper textStripper = new TextStripper(pdfDocument, pageNumber);
+ textStripper.process();
- Utils.sort(pdfTextStripper.textElements);
+ Utils.sort(textStripper.getTextElements(), Rectangle.ILL_DEFINED_ORDER);
- float w, h;
- int pageRotation = p.getRotation();
- if (Math.abs(pageRotation) == 90 || Math.abs(pageRotation) == 270) {
- w = p.getCropBox().getHeight();
- h = p.getCropBox().getWidth();
+ float width, height;
+ int rotation = page.getRotation();
+ if (Math.abs(rotation) == 90 || Math.abs(rotation) == 270) {
+ width = page.getCropBox().getHeight();
+ height = page.getCropBox().getWidth();
} else {
- w = p.getCropBox().getWidth();
- h = p.getCropBox().getHeight();
+ width = page.getCropBox().getWidth();
+ height = page.getCropBox().getHeight();
}
- return new Page(0, 0, w, h, pageRotation, pageNumber, p, pdfTextStripper.textElements,
- se.rulings, pdfTextStripper.minCharWidth, pdfTextStripper.minCharHeight, pdfTextStripper.spatialIndex);
+ return Page.Builder.newInstance()
+ .withPageDims(PageDims.of(0, 0, width, height))
+ .withRotation(rotation)
+ .withNumber(pageNumber)
+ .withPdPage(page)
+ .withPdDocument(pdfDocument)
+ .withRulings(streamEngine.rulings)
+ .withTextElements(textStripper.getTextElements())
+ .withMinCharWidth(textStripper.getMinCharWidth())
+ .withMinCharHeight(textStripper.getMinCharHeight())
+ .withIndex(textStripper.getSpatialIndex())
+ .build();
}
+ // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - //
public PageIterator extract(Iterable pages) {
return new PageIterator(this, pages);
}
public PageIterator extract() {
- return extract(Utils.range(1, this.pdfDocument.getNumberOfPages() + 1));
+ return extract(Utils.range(1, pdfDocument.getNumberOfPages() + 1));
}
public Page extract(int pageNumber) {
return extract(Utils.range(pageNumber, pageNumber + 1)).next();
}
+ // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - //
public void close() throws IOException {
- this.pdfDocument.close();
+ pdfDocument.close();
}
-
-
-
+
}
diff --git a/src/main/java/technology/tabula/ObjectExtractorStreamEngine.java b/src/main/java/technology/tabula/ObjectExtractorStreamEngine.java
index 1538cfa6..9907eca1 100644
--- a/src/main/java/technology/tabula/ObjectExtractorStreamEngine.java
+++ b/src/main/java/technology/tabula/ObjectExtractorStreamEngine.java
@@ -7,7 +7,6 @@
import java.awt.geom.PathIterator;
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
-import java.io.IOException;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
@@ -17,85 +16,77 @@
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.graphics.image.PDImage;
-import org.apache.pdfbox.util.Matrix;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-class ObjectExtractorStreamEngine extends PDFGraphicsStreamEngine {
+import static java.awt.geom.PathIterator.*;
- private static final String NBSP = "\u00A0";
+class ObjectExtractorStreamEngine extends PDFGraphicsStreamEngine {
protected List rulings;
private AffineTransform pageTransform;
- private boolean debugClippingPaths;
private boolean extractRulingLines = true;
- private Logger log;
+ private Logger logger;
private int clipWindingRule = -1;
private GeneralPath currentPath = new GeneralPath();
- public List clippingPaths;
- private Matrix translateMatrix;
+ private static final float RULING_MINIMUM_LENGTH = 0.01f;
protected ObjectExtractorStreamEngine(PDPage page) {
super(page);
+ logger = LoggerFactory.getLogger(ObjectExtractorStreamEngine.class);
+ rulings = new ArrayList<>();
- this.log = LoggerFactory.getLogger(ObjectExtractorStreamEngine.class);
-
- this.rulings = new ArrayList();
- this.pageTransform = null;
+ // Calculate page transform:
+ pageTransform = new AffineTransform();
+ PDRectangle pageCropBox = getPage().getCropBox();
+ int rotationAngleInDegrees = getPage().getRotation();
- // calculate page transform
- PDRectangle cb = this.getPage().getCropBox();
- int rotation = this.getPage().getRotation();
-
- this.pageTransform = new AffineTransform();
-
- if (Math.abs(rotation) == 90 || Math.abs(rotation) == 270) {
- this.pageTransform = AffineTransform.getRotateInstance(rotation * (Math.PI / 180.0), 0, 0);
- this.pageTransform.concatenate(AffineTransform.getScaleInstance(1, -1));
+ if (Math.abs(rotationAngleInDegrees) == 90 || Math.abs(rotationAngleInDegrees) == 270) {
+ double rotationAngleInRadians = rotationAngleInDegrees * (Math.PI / 180.0);
+ pageTransform = AffineTransform.getRotateInstance(rotationAngleInRadians, 0, 0);
} else {
- this.pageTransform.concatenate(AffineTransform.getTranslateInstance(0, cb.getHeight()));
- this.pageTransform.concatenate(AffineTransform.getScaleInstance(1, -1));
+ double deltaX = 0;
+ double deltaY = pageCropBox.getHeight();
+ pageTransform.concatenate(AffineTransform.getTranslateInstance(deltaX, deltaY));
}
- this.pageTransform.translate(-cb.getLowerLeftX(), -cb.getLowerLeftY());
+ pageTransform.concatenate(AffineTransform.getScaleInstance(1, -1));
+ pageTransform.translate(-pageCropBox.getLowerLeftX(), -pageCropBox.getLowerLeftY());
}
+ // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - //
@Override
- public void appendRectangle(Point2D p0, Point2D p1, Point2D p2, Point2D p3) throws IOException {
+ public void appendRectangle(Point2D p0, Point2D p1, Point2D p2, Point2D p3) {
currentPath.moveTo((float) p0.getX(), (float) p0.getY());
currentPath.lineTo((float) p1.getX(), (float) p1.getY());
currentPath.lineTo((float) p2.getX(), (float) p2.getY());
currentPath.lineTo((float) p3.getX(), (float) p3.getY());
-
currentPath.closePath();
}
@Override
- public void clip(int windingRule) throws IOException {
- // the clipping path will not be updated until the succeeding painting
- // operator is called
+ public void clip(int windingRule) {
+ // The clipping path will not be updated until the succeeding painting
+ // operator is called.
clipWindingRule = windingRule;
}
@Override
- public void closePath() throws IOException {
+ public void closePath() {
currentPath.closePath();
}
@Override
- public void curveTo(float x1, float y1, float x2, float y2, float x3, float y3) throws IOException {
+ public void curveTo(float x1, float y1, float x2, float y2, float x3, float y3) {
currentPath.curveTo(x1, y1, x2, y2, x3, y3);
}
@Override
- public void drawImage(PDImage arg0) throws IOException {
- // TODO Auto-generated method stub
-
- }
+ public void drawImage(PDImage arg0) {}
@Override
- public void endPath() throws IOException {
+ public void endPath() {
if (clipWindingRule != -1) {
currentPath.setWindingRule(clipWindingRule);
getGraphicsState().intersectClippingPath(currentPath);
@@ -105,170 +96,176 @@ public void endPath() throws IOException {
}
@Override
- public void fillAndStrokePath(int arg0) throws IOException {
+ public void fillAndStrokePath(int arg0) {
strokeOrFillPath(true);
}
@Override
- public void fillPath(int arg0) throws IOException {
+ public void fillPath(int arg0) {
strokeOrFillPath(true);
}
@Override
- public Point2D getCurrentPoint() throws IOException {
+ public Point2D getCurrentPoint() {
return currentPath.getCurrentPoint();
}
@Override
- public void lineTo(float x, float y) throws IOException {
+ public void lineTo(float x, float y) {
currentPath.lineTo(x, y);
}
@Override
- public void moveTo(float x, float y) throws IOException {
+ public void moveTo(float x, float y) {
currentPath.moveTo(x, y);
}
@Override
- public void shadingFill(COSName arg0) throws IOException {
- // TODO Auto-generated method stub
-
- }
+ public void shadingFill(COSName arg0) {}
+ // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - //
@Override
- public void strokePath() throws IOException {
+ public void strokePath() {
strokeOrFillPath(false);
}
private void strokeOrFillPath(boolean isFill) {
- GeneralPath path = this.currentPath;
-
- if (!this.extractRulingLines) {
- this.currentPath.reset();
+ if (!extractRulingLines) {
+ currentPath.reset();
return;
}
- PathIterator pi = path.getPathIterator(this.getPageTransform());
- float[] c = new float[6];
- int currentSegment;
-
- // skip paths whose first operation is not a MOVETO
- // or contains operations other than LINETO, MOVETO or CLOSE
- if ((pi.currentSegment(c) != PathIterator.SEG_MOVETO)) {
- path.reset();
- return;
- }
- pi.next();
- while (!pi.isDone()) {
- currentSegment = pi.currentSegment(c);
- if (currentSegment != PathIterator.SEG_LINETO && currentSegment != PathIterator.SEG_CLOSE
- && currentSegment != PathIterator.SEG_MOVETO) {
- path.reset();
- return;
- }
- pi.next();
- }
+ boolean didNotPassedTheFilter = filterPathBySegmentType();
+ if (didNotPassedTheFilter) return;
// TODO: how to implement color filter?
- // skip the first path operation and save it as the starting position
- float[] first = new float[6];
- pi = path.getPathIterator(this.getPageTransform());
- pi.currentSegment(first);
- // last move
- Point2D.Float start_pos = new Point2D.Float(Utils.round(first[0], 2), Utils.round(first[1], 2));
- Point2D.Float last_move = start_pos;
- Point2D.Float end_pos = null;
+ // Skip the first path operation and save it as the starting point.
+ PathIterator pathIterator = currentPath.getPathIterator(getPageTransform());
+
+ float[] coordinates = new float[6];
+ int currentSegment;
+
+ Point2D.Float startPoint = getStartPoint(pathIterator);
+ Point2D.Float last_move = startPoint;
+ Point2D.Float endPoint = null;
Line2D.Float line;
- PointComparator pc = new PointComparator();
- while (!pi.isDone()) {
- pi.next();
- // This can be the last segment, when pi.isDone, but we need to
- // process it
- // otherwise us-017.pdf fails the last value.
+ PointComparator pointComparator = new PointComparator();
+
+ while (!pathIterator.isDone()) {
+ pathIterator.next();
+ // This can be the last segment, when pathIterator.isDone(), but we need to
+ // process it; otherwise us-017.pdf fails the last value.
try {
- currentSegment = pi.currentSegment(c);
+ currentSegment = pathIterator.currentSegment(coordinates);
} catch (IndexOutOfBoundsException ex) {
continue;
}
switch (currentSegment) {
- case PathIterator.SEG_LINETO:
- end_pos = new Point2D.Float(c[0], c[1]);
-
- line = pc.compare(start_pos, end_pos) == -1 ? new Line2D.Float(start_pos, end_pos)
- : new Line2D.Float(end_pos, start_pos);
-
- if (line.intersects(this.currentClippingPath())) {
- Ruling r = new Ruling(line.getP1(), line.getP2()).intersect(this.currentClippingPath());
-
- if (r.length() > 0.01) {
- this.rulings.add(r);
- }
+ case SEG_LINETO:
+ endPoint = new Point2D.Float(coordinates[0], coordinates[1]);
+ if (startPoint == null || endPoint == null) {
+ break;
}
+ line = getLineBetween(startPoint, endPoint, pointComparator);
+ verifyLineIntersectsClipping(line);
break;
- case PathIterator.SEG_MOVETO:
- last_move = new Point2D.Float(c[0], c[1]);
- end_pos = last_move;
+ case SEG_MOVETO:
+ last_move = new Point2D.Float(coordinates[0], coordinates[1]);
+ endPoint = last_move;
break;
- case PathIterator.SEG_CLOSE:
- // according to PathIterator docs:
- // "the preceding subpath should be closed by appending a line
- // segment
- // back to the point corresponding to the most recent
+ case SEG_CLOSE:
+ // According to PathIterator docs:
+ // "The preceding sub-path should be closed by appending a line
+ // segment back to the point corresponding to the most recent
// SEG_MOVETO."
- line = pc.compare(end_pos, last_move) == -1 ? new Line2D.Float(end_pos, last_move)
- : new Line2D.Float(last_move, end_pos);
-
- if (line.intersects(this.currentClippingPath())) {
- Ruling r = new Ruling(line.getP1(), line.getP2()).intersect(this.currentClippingPath());
-
- if (r.length() > 0.01) {
- this.rulings.add(r);
- }
+ if (startPoint == null || endPoint == null) {
+ break;
}
+ line = getLineBetween(endPoint, last_move, pointComparator);
+ verifyLineIntersectsClipping(line);
break;
}
- start_pos = end_pos;
+ startPoint = endPoint;
}
- path.reset();
+ currentPath.reset();
}
- public AffineTransform getPageTransform() {
- return this.pageTransform;
+ private boolean filterPathBySegmentType() {
+ PathIterator pathIterator = currentPath.getPathIterator(pageTransform);
+ float[] coordinates = new float[6];
+ int currentSegmentType = pathIterator.currentSegment(coordinates);
+ if (currentSegmentType != SEG_MOVETO) {
+ currentPath.reset();
+ return true;
+ }
+ pathIterator.next();
+ while (!pathIterator.isDone()) {
+ currentSegmentType = pathIterator.currentSegment(coordinates);
+ if (currentSegmentType != SEG_LINETO && currentSegmentType != SEG_CLOSE && currentSegmentType != SEG_MOVETO) {
+ currentPath.reset();
+ return true;
+ }
+ pathIterator.next();
+ }
+ return false;
}
- public Rectangle2D currentClippingPath() {
- Shape clippingPath = this.getGraphicsState().getCurrentClippingPath();
- Shape transformedClippingPath = this.getPageTransform().createTransformedShape(clippingPath);
+ private Point2D.Float getStartPoint(PathIterator pathIterator) {
+ float[] startPointCoordinates = new float[6];
+ pathIterator.currentSegment(startPointCoordinates);
+ float x = Utils.round(startPointCoordinates[0], 2);
+ float y = Utils.round(startPointCoordinates[1], 2);
+ return new Point2D.Float(x, y);
+ }
- return transformedClippingPath.getBounds2D();
+ private Line2D.Float getLineBetween(Point2D.Float pointA, Point2D.Float pointB, PointComparator pointComparator) {
+ if (pointComparator.compare(pointA, pointB) == -1) {
+ return new Line2D.Float(pointA, pointB);
+ }
+ return new Line2D.Float(pointB, pointA);
+ }
+
+ private void verifyLineIntersectsClipping(Line2D.Float line) {
+ Rectangle2D currentClippingPath = currentClippingPath();
+ if (line.intersects(currentClippingPath)) {
+ Ruling ruling = new Ruling(line.getP1(), line.getP2()).intersect(currentClippingPath);
+ if (ruling.length() > RULING_MINIMUM_LENGTH) {
+ rulings.add(ruling);
+ }
+ }
}
- public boolean isDebugClippingPaths() {
- return debugClippingPaths;
+ // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - //
+ public AffineTransform getPageTransform() {
+ return pageTransform;
}
- public void setDebugClippingPaths(boolean debugClippingPaths) {
- this.debugClippingPaths = debugClippingPaths;
+ public Rectangle2D currentClippingPath() {
+ Shape currentClippingPath = getGraphicsState().getCurrentClippingPath();
+ Shape transformedClippingPath = getPageTransform().createTransformedShape(currentClippingPath);
+ return transformedClippingPath.getBounds2D();
}
+ // TODO: repeated in SpreadsheetExtractionAlgorithm.
class PointComparator implements Comparator {
@Override
- public int compare(Point2D o1, Point2D o2) {
- float o1X = Utils.round(o1.getX(), 2);
- float o1Y = Utils.round(o1.getY(), 2);
- float o2X = Utils.round(o2.getX(), 2);
- float o2Y = Utils.round(o2.getY(), 2);
+ public int compare(Point2D p1, Point2D p2) {
+ float p1X = Utils.round(p1.getX(), 2);
+ float p1Y = Utils.round(p1.getY(), 2);
+ float p2X = Utils.round(p2.getX(), 2);
+ float p2Y = Utils.round(p2.getY(), 2);
- if (o1Y > o2Y)
+ if (p1Y > p2Y)
return 1;
- if (o1Y < o2Y)
+ if (p1Y < p2Y)
return -1;
- if (o1X > o2X)
+ if (p1X > p2X)
return 1;
- if (o1X < o2X)
+ if (p1X < p2X)
return -1;
return 0;
}
}
+
}
diff --git a/src/main/java/technology/tabula/Page.java b/src/main/java/technology/tabula/Page.java
index 8177921b..ed74d14a 100644
--- a/src/main/java/technology/tabula/Page.java
+++ b/src/main/java/technology/tabula/Page.java
@@ -6,132 +6,216 @@
import java.util.Comparator;
import java.util.List;
+import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
+import static java.lang.Float.compare;
+import static java.util.Collections.min;
+
@SuppressWarnings("serial")
// TODO: this class should probably be called "PageArea" or something like that
public class Page extends Rectangle {
+ private int number;
private Integer rotation;
- private int pageNumber;
- private List texts;
- private List rulings, cleanRulings = null, verticalRulingLines = null, horizontalRulingLines = null;
private float minCharWidth;
private float minCharHeight;
- private RectangleSpatialIndex spatial_index;
+
+ private List textElements;
+
+ // TODO: Create a class for 'List<Ruling>' that encapsulates all of these lists and their behaviors?
+ private List rulings,
+ cleanRulings = null,
+ verticalRulingLines = null,
+ horizontalRulingLines = null;
+
private PDPage pdPage;
+ private PDDocument pdDoc;
- public Page(float top, float left, float width, float height, int rotation, int page_number, PDPage pdPage) {
- super(top, left, width, height);
+ private RectangleSpatialIndex spatialIndex;
+
+ private static final float DEFAULT_MIN_CHAR_LENGTH = 7;
+
+ private Page(
+ PageDims pageDims,
+ int rotation,
+ int number,
+ PDPage pdPage,
+ PDDocument doc,
+ List characters,
+ List rulings,
+ float minCharWidth,
+ float minCharHeight,
+ RectangleSpatialIndex index
+ ) {
+ super(pageDims.getTop(), pageDims.getLeft(), pageDims.getWidth(), pageDims.getHeight());
this.rotation = rotation;
- this.pageNumber = page_number;
+ this.number = number;
this.pdPage = pdPage;
+ this.pdDoc = doc;
+ this.textElements = characters;
+ this.rulings = rulings;
+ this.minCharWidth = minCharWidth;
+ this.minCharHeight = minCharHeight;
+ this.spatialIndex = index;
}
-
- public Page(float top, float left, float width, float height, int rotation, int page_number, PDPage pdPage,
- List characters, List rulings) {
- this(top, left, width, height, rotation, page_number, pdPage);
- this.texts = characters;
- this.rulings = rulings;
+ /**
+ *
+ * @deprecated use {@link Builder} instead
+ */
+ @Deprecated
+ public Page(float top, float left, float width, float height, int rotation, int number, PDPage pdPage, PDDocument doc) {
+ super(top, left, width, height);
+ this.rotation = rotation;
+ this.number = number;
+ this.pdPage = pdPage;
+ this.pdDoc = doc;
+ }
+
+ /**
+ *
+ * @deprecated use {@link Builder} instead
+ */
+ public Page(float top, float left, float width, float height, int rotation, int number, PDPage pdPage, PDDocument doc,
+ List characters, List rulings) {
+ this(top, left, width, height, rotation, number, pdPage, doc);
+ this.textElements = characters;
+ this.rulings = rulings;
}
+ /**
+ *
+ * @deprecated use {@link Builder} instead
+ */
+ public Page(float top, float left, float width, float height, int rotation, int number, PDPage pdPage, PDDocument doc,
+ ObjectExtractorStreamEngine streamEngine, TextStripper textStripper) {
+ this(top, left, width, height, rotation, number, pdPage, doc, textStripper.getTextElements(), streamEngine.rulings);
+ this.minCharWidth = textStripper.getMinCharWidth();
+ this.minCharHeight = textStripper.getMinCharHeight();
+ this.spatialIndex = textStripper.getSpatialIndex();
+ }
- public Page(float top, float left, float width, float height, int rotation, int page_number, PDPage pdPage,
- List characters, List rulings,
- float minCharWidth, float minCharHeight, RectangleSpatialIndex index) {
- this(top, left, width, height, rotation, page_number, pdPage, characters, rulings);
- this.minCharHeight = minCharHeight;
- this.minCharWidth = minCharWidth;
- this.spatial_index = index;
+
+ /**
+ *
+ * @deprecated use {@link Builder} instead
+ */
+ public Page(float top, float left, float width, float height, int rotation, int number, PDPage pdPage, PDDocument doc,
+ List characters, List rulings,
+ float minCharWidth, float minCharHeight, RectangleSpatialIndex index) {
+ this(top, left, width, height, rotation, number, pdPage, doc, characters, rulings);
+ this.minCharHeight = minCharHeight;
+ this.minCharWidth = minCharWidth;
+ this.spatialIndex = index;
}
-
+ // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - //
public Page getArea(Rectangle area) {
- List t = getText(area);
- float min_char_width = 7;
- float min_char_height = 7;
-
- if(t.size() > 0){
- min_char_width = Collections.min(t, new Comparator() {
- @Override
- public int compare(TextElement te1, TextElement te2) {
- return java.lang.Float.compare(te1.width, te2.width);
- }}).width;
- min_char_height = Collections.min(t, new Comparator() {
- @Override
- public int compare(TextElement te1, TextElement te2) {
- return java.lang.Float.compare(te1.height, te2.height);
- }}).height;
- }
- Page rv = new Page(
- (float) area.getTop(),
- (float) area.getLeft(),
- (float) area.getWidth(),
- (float) area.getHeight(),
- rotation,
- pageNumber,
- pdPage,
- t,
- Ruling.cropRulingsToArea(getRulings(), area),
- min_char_width,
- min_char_height,
- spatial_index);
-
- rv.addRuling(new Ruling(
- new Point2D.Double(rv.getLeft(),
- rv.getTop()),
- new Point2D.Double(rv.getRight(),
- rv.getTop())));
- rv.addRuling(new Ruling(
- new Point2D.Double(rv.getRight(),
- rv.getTop()),
- new Point2D.Double(rv.getRight(),
- rv.getBottom())));
- rv.addRuling(new Ruling(
- new Point2D.Double(rv.getRight(),
- rv.getBottom()),
- new Point2D.Double(rv.getLeft(),
- rv.getBottom())));
- rv.addRuling(new Ruling(
- new Point2D.Double(rv.getLeft(),
- rv.getBottom()),
- new Point2D.Double(rv.getLeft(),
- rv.getTop())));
-
- return rv;
- }
-
- public Page getArea(float top, float left, float bottom, float right) {
- Rectangle area = new Rectangle(top, left, right - left, bottom - top);
- return this.getArea(area);
+ List areaTextElements = getText(area);
+
+ float minimumCharWidth = getMinimumCharWidthFrom(areaTextElements);
+ float minimumCharHeight = getMinimumCharHeightFrom(areaTextElements);
+
+ final Page page = Page.Builder.newInstance()
+ .withPageDims(PageDims.of(area.getTop(), area.getLeft(), (float) area.getWidth(), (float) area.getHeight()))
+ .withRotation(rotation)
+ .withNumber(number)
+ .withPdPage(pdPage)
+ .withPdDocument(pdDoc)
+ .withTextElements(areaTextElements)
+ .withRulings(Ruling.cropRulingsToArea(getRulings(), area))
+ .withMinCharWidth(minimumCharWidth)
+ .withMinCharHeight(minimumCharHeight)
+ .withIndex(spatialIndex)
+ .build();
+
+ addBorderRulingsTo(page);
+
+ return page;
}
-
- public List getText() {
- return texts;
+
+ private float getMinimumCharWidthFrom(List areaTextElements) {
+ if (!areaTextElements.isEmpty()) {
+ return min(areaTextElements, (te1, te2) -> compare(te1.width, te2.width)).width;
+ }
+ return DEFAULT_MIN_CHAR_LENGTH;
}
-
- public List getText(Rectangle area) {
- return this.spatial_index.contains(area);
+
+ private float getMinimumCharHeightFrom(List areaTextElements) {
+ if (!areaTextElements.isEmpty()) {
+ return min(areaTextElements, (te1, te2) -> compare(te1.height, te2.height)).height;
+ }
+ return DEFAULT_MIN_CHAR_LENGTH;
}
-
- public List getText(float top, float left, float bottom, float right) {
- return this.getText(new Rectangle(top, left, right - left, bottom - top));
+
+ private void addBorderRulingsTo(Page page) {
+ Point2D.Double leftTop = new Point2D.Double(page.getLeft(), page.getTop()),
+ rightTop = new Point2D.Double(page.getRight(), page.getTop()),
+ rightBottom = new Point2D.Double(page.getRight(), page.getBottom()),
+ leftBottom = new Point2D.Double(page.getLeft(), page.getBottom());
+ page.addRuling(new Ruling(leftTop, rightTop));
+ page.addRuling(new Ruling(rightTop, rightBottom));
+ page.addRuling(new Ruling(rightBottom, leftBottom));
+ page.addRuling(new Ruling(leftBottom, leftTop));
+ }
+
+ public Page getArea(float top, float left, float bottom, float right) {
+ Rectangle area = new Rectangle(top, left, right - left, bottom - top);
+ return getArea(area);
}
+ // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - //
public Integer getRotation() {
return rotation;
}
public int getPageNumber() {
- return pageNumber;
+ return number;
+ }
+
+ /**
+ * @deprecated with no replacement
+ */
+ @Deprecated
+ public float getMinCharWidth() {
+ return minCharWidth;
+ }
+
+ /**
+ * @deprecated with no replacement
+ */
+ @Deprecated
+ public float getMinCharHeight() {
+ return minCharHeight;
+ }
+
+ // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - //
+ public List getText() {
+ return textElements;
+ }
+
+ public List getText(Rectangle area) {
+ return spatialIndex.contains(area);
+ }
+
+ /**
+ * @deprecated use {@linkplain #getText(Rectangle)} instead
+ */
+ @Deprecated
+ public List getText(float top, float left, float bottom, float right) {
+ return getText(new Rectangle(top, left, right - left, bottom - top));
}
+ /**
+ * @deprecated use {@linkplain #getText()} instead
+ */
+ @Deprecated
public List getTexts() {
- return texts;
+ return textElements;
}
-
+
/**
* Returns the minimum bounding box that contains all the TextElements on this Page
*/
@@ -139,99 +223,194 @@ public Rectangle getTextBounds() {
List texts = this.getText();
if (!texts.isEmpty()) {
return Utils.bounds(texts);
- }
- else {
+ } else {
return new Rectangle();
}
-
}
+ /**
+ * @deprecated with no replacement
+ */
+ @Deprecated
+ public boolean hasText() {
+ return textElements.size() > 0;
+ }
+
+ // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - //
public List getRulings() {
- if (this.cleanRulings != null) {
- return this.cleanRulings;
- }
-
- if (this.rulings == null || this.rulings.isEmpty()) {
- this.verticalRulingLines = new ArrayList();
- this.horizontalRulingLines = new ArrayList();
- return new ArrayList();
- }
-
- Utils.snapPoints(this.rulings, this.minCharWidth, this.minCharHeight);
-
- List vrs = new ArrayList();
- for (Ruling vr: this.rulings) {
- if (vr.vertical()) {
- vrs.add(vr);
+ if (cleanRulings != null) {
+ return cleanRulings;
+ }
+
+ if (rulings == null || rulings.isEmpty()) {
+ verticalRulingLines = new ArrayList<>();
+ horizontalRulingLines = new ArrayList<>();
+ return new ArrayList<>();
+ }
+
+ // TODO: Move as a static method to the Ruling class?
+ Utils.snapPoints(rulings, minCharWidth, minCharHeight);
+
+ verticalRulingLines = getCollapsedVerticalRulings();
+ horizontalRulingLines = getCollapsedHorizontalRulings();
+
+ cleanRulings = new ArrayList<>(verticalRulingLines);
+ cleanRulings.addAll(horizontalRulingLines);
+
+ return cleanRulings;
+ }
+
+ // TODO: Create a class for 'List<Ruling>' and encapsulate these behaviors within it?
+ private List getCollapsedVerticalRulings() {
+ List verticalRulings = new ArrayList<>();
+ for (Ruling ruling : rulings) {
+ if (ruling.vertical()) {
+ verticalRulings.add(ruling);
}
}
- this.verticalRulingLines = Ruling.collapseOrientedRulings(vrs);
-
- List hrs = new ArrayList();
- for (Ruling hr: this.rulings) {
- if (hr.horizontal()) {
- hrs.add(hr);
+ return Ruling.collapseOrientedRulings(verticalRulings);
+ }
+
+ private List getCollapsedHorizontalRulings() {
+ List horizontalRulings = new ArrayList<>();
+ for (Ruling ruling : rulings) {
+ if (ruling.horizontal()) {
+ horizontalRulings.add(ruling);
}
}
- this.horizontalRulingLines = Ruling.collapseOrientedRulings(hrs);
-
- this.cleanRulings = new ArrayList(this.verticalRulingLines);
- this.cleanRulings.addAll(this.horizontalRulingLines);
-
- return this.cleanRulings;
-
+ return Ruling.collapseOrientedRulings(horizontalRulings);
}
-
+
public List getVerticalRulings() {
- if (this.verticalRulingLines != null) {
- return this.verticalRulingLines;
+ if (verticalRulingLines != null) {
+ return verticalRulingLines;
}
- this.getRulings();
- return this.verticalRulingLines;
+ getRulings();
+ return verticalRulingLines;
}
-
+
public List getHorizontalRulings() {
- if (this.horizontalRulingLines != null) {
- return this.horizontalRulingLines;
+ if (horizontalRulingLines != null) {
+ return horizontalRulingLines;
}
- this.getRulings();
- return this.horizontalRulingLines;
+ getRulings();
+ return horizontalRulingLines;
}
-
- public void addRuling(Ruling r) {
- if (r.oblique()) {
- throw new UnsupportedOperationException("Can't add an oblique ruling");
+
+ public void addRuling(Ruling ruling) {
+ if (ruling.oblique()) {
+ throw new UnsupportedOperationException("Can't add an oblique ruling.");
}
- this.rulings.add(r);
- // clear caches
- this.verticalRulingLines = null;
- this.horizontalRulingLines = null;
- this.cleanRulings = null;
+ rulings.add(ruling);
+ // Clear caches:
+ verticalRulingLines = null;
+ horizontalRulingLines = null;
+ cleanRulings = null;
}
-
+
public List getUnprocessedRulings() {
- return this.rulings;
+ return rulings;
}
- public float getMinCharWidth() {
- return minCharWidth;
+ // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - //
+ public PDPage getPDPage() {
+ return pdPage;
}
- public float getMinCharHeight() {
- return minCharHeight;
+ public PDDocument getPDDoc() {
+ return pdDoc;
}
- public PDPage getPDPage() {
- return pdPage;
- }
+ // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - //
+ /**
+ * @deprecated with no replacement
+ */
+ @Deprecated
public RectangleSpatialIndex getSpatialIndex() {
- return this.spatial_index;
+ return spatialIndex;
}
-
- public boolean hasText() {
- return this.texts.size() > 0;
+
+ public static class Builder {
+ private PageDims pageDims;
+ private int rotation;
+ private int number;
+ private PDPage pdPage;
+ private PDDocument pdDocument;
+ private List textElements;
+ private List rulings;
+ private float minCharWidth;
+ private float minCharHeight;
+ private RectangleSpatialIndex index;
+
+ private Builder() {}
+
+ public static Builder newInstance() {
+ return new Builder();
+ }
+
+ public Builder withPageDims(PageDims pageDims) {
+ this.pageDims = pageDims;
+
+ return this;
+ }
+
+ public Builder withRotation(int rotation) {
+ this.rotation = rotation;
+
+ return this;
+ }
+
+ public Builder withNumber(int number) {
+ this.number = number;
+
+ return this;
+ }
+
+ public Builder withPdPage(PDPage pdPage) {
+ this.pdPage = pdPage;
+
+ return this;
+ }
+
+ public Builder withPdDocument(PDDocument pdDocument) {
+ this.pdDocument = pdDocument;
+
+ return this;
+ }
+
+ public Builder withTextElements(List textElements) {
+ this.textElements = textElements;
+
+ return this;
+ }
+
+ public Builder withRulings(List rulings) {
+ this.rulings = rulings;
+
+ return this;
+ }
+
+ public Builder withMinCharWidth(float minCharWidth) {
+ this.minCharWidth = minCharWidth;
+
+ return this;
+ }
+
+ public Builder withMinCharHeight(float minCharHeight) {
+ this.minCharHeight = minCharHeight;
+
+ return this;
+ }
+
+ public Builder withIndex(RectangleSpatialIndex index) {
+ this.index = index;
+
+ return this;
+ }
+
+ public Page build() {
+ return new Page(pageDims, rotation, number, pdPage, pdDocument, textElements, rulings, minCharWidth, minCharHeight, index);
+ }
}
-
-
}
diff --git a/src/main/java/technology/tabula/PageDims.java b/src/main/java/technology/tabula/PageDims.java
new file mode 100644
index 00000000..1598d125
--- /dev/null
+++ b/src/main/java/technology/tabula/PageDims.java
@@ -0,0 +1,35 @@
+package technology.tabula;
+
+public class PageDims {
+ private final float top;
+ private final float left;
+ private final float width;
+ private final float height;
+
+ private PageDims(final float top, final float left, final float width, final float height) {
+ this.top = top;
+ this.left = left;
+ this.width = width;
+ this.height = height;
+ }
+
+ public static PageDims of(final float top, final float left, final float width, final float height) {
+ return new PageDims(top, left, width, height);
+ }
+
+ public float getTop() {
+ return top;
+ }
+
+ public float getLeft() {
+ return left;
+ }
+
+ public float getWidth() {
+ return width;
+ }
+
+ public float getHeight() {
+ return height;
+ }
+}
diff --git a/src/main/java/technology/tabula/PageIterator.java b/src/main/java/technology/tabula/PageIterator.java
index 5fec2a77..052ed54a 100644
--- a/src/main/java/technology/tabula/PageIterator.java
+++ b/src/main/java/technology/tabula/PageIterator.java
@@ -5,39 +5,39 @@
public class PageIterator implements Iterator {
- private ObjectExtractor oe;
+ private ObjectExtractor objectExtractor;
private Iterator pageIndexIterator;
-
- public PageIterator(ObjectExtractor oe, Iterable pages) {
+
+ public PageIterator(ObjectExtractor objectExtractor, Iterable pages) {
super();
- this.oe = oe;
+ this.objectExtractor = objectExtractor;
this.pageIndexIterator = pages.iterator();
}
+ // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - //
@Override
public boolean hasNext() {
- return this.pageIndexIterator.hasNext();
+ return pageIndexIterator.hasNext();
}
@Override
public Page next() {
- Page page = null;
+ Page nextPage = null;
if (!this.hasNext()) {
throw new IllegalStateException();
}
try {
- page = oe.extractPage(this.pageIndexIterator.next());
+ nextPage = objectExtractor.extractPage(pageIndexIterator.next());
} catch (IOException e) {
- // TODO Auto-generated catch block
e.printStackTrace();
}
- return page;
+ return nextPage;
}
+ // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - //
@Override
public void remove() {
throw new UnsupportedOperationException();
-
}
-}
\ No newline at end of file
+}
diff --git a/src/main/java/technology/tabula/Pair.java b/src/main/java/technology/tabula/Pair.java
new file mode 100644
index 00000000..d54cbbe5
--- /dev/null
+++ b/src/main/java/technology/tabula/Pair.java
@@ -0,0 +1,19 @@
+package technology.tabula;
+
+public class Pair {
+ private final L left;
+ private final R right;
+
+ public Pair(L left, R right) {
+ this.left = left;
+ this.right = right;
+ }
+
+ public L getLeft() {
+ return this.left;
+ }
+
+ public R getRight() {
+ return this.right;
+ }
+}
diff --git a/src/main/java/technology/tabula/ProjectionProfile.java b/src/main/java/technology/tabula/ProjectionProfile.java
index d80f18b0..39ab9e41 100644
--- a/src/main/java/technology/tabula/ProjectionProfile.java
+++ b/src/main/java/technology/tabula/ProjectionProfile.java
@@ -73,7 +73,7 @@ public float[] getHorizontalProjection() {
public float[] findVerticalSeparators(float minColumnWidth) {
boolean foundNarrower = false;
- List verticalSeparators = new ArrayList();
+ List verticalSeparators = new ArrayList<>();
for (Ruling r: area.getVerticalRulings()) {
if (r.length() / this.textBounds.getHeight() >= 0.95) {
verticalSeparators.add(toFixed(r.getPosition() - this.areaLeft));
@@ -105,7 +105,7 @@ public float[] findVerticalSeparators(float minColumnWidth) {
public float[] findHorizontalSeparators(float minRowHeight) {
boolean foundShorter = false;
- List horizontalSeparators = new ArrayList();
+ List horizontalSeparators = new ArrayList<>();
for (Ruling r: area.getHorizontalRulings()) {
System.out.println(r.length() / this.textBounds.getWidth());
if (r.length() / this.textBounds.getWidth() >= 0.95) {
@@ -136,7 +136,7 @@ public float[] findHorizontalSeparators(float minRowHeight) {
}
private static List findSeparatorsFromProjection(float[] derivative) {
- List separators = new ArrayList();
+ List separators = new ArrayList<>();
Integer lastNeg = null;
float s;
boolean positiveSlope = false;
@@ -167,7 +167,7 @@ public static float[] smooth(float[] data, int kernelSize) {
+ kernelSize / 2, data.length); j++) {
s += data[j];
}
- rv[i] = (float) Math.floor(s / (float) kernelSize);
+ rv[i] = (float) Math.floor(s / kernelSize);
}
}
return rv;
@@ -213,7 +213,7 @@ private static int toFixed(double value) {
}
private static double toDouble(int value) {
- return (double) value / Math.pow(10, DECIMAL_PLACES);
+ return value / Math.pow(10, DECIMAL_PLACES);
}
}
diff --git a/src/main/java/technology/tabula/QuickSort.java b/src/main/java/technology/tabula/QuickSort.java
index 21d26dd5..03388a15 100644
--- a/src/main/java/technology/tabula/QuickSort.java
+++ b/src/main/java/technology/tabula/QuickSort.java
@@ -16,94 +16,97 @@
*/
package technology.tabula;
+import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
+import java.util.RandomAccess;
import java.util.Stack;
/**
- * see http://de.wikipedia.org/wiki/Quicksort.
+ * An implementation of Quicksort.
+ *
+ * @see <a href="http://de.wikipedia.org/wiki/Quicksort">wikipedia</a>
*
* @author UWe Pachler
*/
-public class QuickSort
-{
-
- private QuickSort()
- {
- }
-
- private static final Comparator extends Comparable> objComp = new Comparator()
- {
- public int compare(Comparable object1, Comparable object2)
- {
- return object1.compareTo(object2);
- }
- };
+public final class QuickSort {
+
+ private QuickSort() {
+ // utility class, not meant to be instantiated
+ }
+
+ /**
+ * Sorts the given list according to natural order.
+ */
+ public static > void sort(List list) {
+ sort(list, QuickSort.naturalOrder()); // JAVA_8 replace with Comparator.naturalOrder() (and cleanup)
+ }
+
+ /**
+ * Sorts the given list using the given comparator.
+ */
+ public static void sort(List list, Comparator super T> comparator) {
+ if (list instanceof RandomAccess) {
+ quicksort(list, comparator);
+ } else {
+ List copy = new ArrayList<>(list);
+ quicksort(copy, comparator);
+ list.clear();
+ list.addAll(copy);
+ }
+ }
- /**
- * Sorts the given list using the given comparator.
- */
- public static void sort(List list, Comparator cmp)
- {
- quicksort(list, cmp);
- }
+ private static void quicksort(List list, Comparator super T> cmp) {
+ Stack stack = new Stack<>();
+ stack.push(0);
+ stack.push(list.size());
+ while (!stack.isEmpty()) {
+ int right = stack.pop();
+ int left = stack.pop();
+
+ if (right - left < 2) continue;
+ int p = left + ((right - left) / 2);
+ p = partition(list, cmp, p, left, right);
- /**
- * Sorts the given list using compareTo as comparator.
- */
- public static void sort(List list)
- {
- sort(list, (Comparator) objComp);
- }
+ stack.push(p + 1);
+ stack.push(right);
- private static void quicksort(List list, Comparator cmp)
- {
- Stack stack = new Stack();
- stack.push(0);
- stack.push(list.size());
- while (!stack.isEmpty()) {
- int right = stack.pop();
- int left = stack.pop();
- if (right - left < 2) continue;
- int p = left + ((right-left)/2);
- p = partition(list, cmp, p, left, right);
-
- stack.push(p+1);
- stack.push(right);
+ stack.push(left);
+ stack.push(p);
+ }
+ }
- stack.push(left);
- stack.push(p);
+ private static int partition(List list, Comparator super T> cmp, int p, int start, int end) {
+ int l = start;
+ int h = end - 2;
+ T piv = list.get(p);
+ swap(list, p, end - 1);
- }
- }
-
- private static int partition(List list, Comparator cmp, int p, int start, int end) {
- int l = start;
- int h = end - 2;
- T piv = list.get(p);
- swap(list,p,end-1);
+ while (l < h) {
+ if (cmp.compare(list.get(l), piv) <= 0) l++;
+ else if (cmp.compare(piv, list.get(h)) <= 0) h--;
+ else swap(list, l, h);
+ }
+ int idx = h;
+ if (cmp.compare(list.get(h), piv) < 0) idx++;
+ swap(list, end - 1, idx);
+ return idx;
+ }
- while (l < h) {
- if (cmp.compare(list.get(l), piv) <= 0) {
- l++;
- } else if (cmp.compare(piv, list.get(h)) <= 0) {
- h--;
- } else {
- swap(list,l,h);
- }
- }
- int idx = h;
- if (cmp.compare(list.get(h), piv) < 0) idx++;
- swap(list,end-1,idx);
- return idx;
- }
-
+ private static void swap(List list, int i, int j) {
+ T tmp = list.get(i);
+ list.set(i, list.get(j));
+ list.set(j, tmp);
+ }
- private static void swap(List list, int i, int j)
- {
- T tmp = list.get(i);
- list.set(i, list.get(j));
- list.set(j, tmp);
- }
+ @SuppressWarnings({ "rawtypes", "unchecked" })
+ private static final Comparator NATURAL_ORDER = new Comparator() {
+ @Override public int compare(Object l, Object r) { return ((Comparable) l).compareTo(r); }
+ };
+
+ @SuppressWarnings("unchecked")
+ private static > Comparator naturalOrder() {
+ return NATURAL_ORDER;
+ }
}
diff --git a/src/main/java/technology/tabula/Rectangle.java b/src/main/java/technology/tabula/Rectangle.java
index 41b79374..b96fcd77 100644
--- a/src/main/java/technology/tabula/Rectangle.java
+++ b/src/main/java/technology/tabula/Rectangle.java
@@ -2,171 +2,177 @@
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
+import java.util.Comparator;
import java.util.List;
+import java.util.Locale;
@SuppressWarnings("serial")
-public class Rectangle extends Rectangle2D.Float implements Comparable {
-
- protected static final float VERTICAL_COMPARISON_THRESHOLD = 0.4f;
-
- public Rectangle() {
- super();
- }
-
- public Rectangle(float top, float left, float width, float height) {
- super();
- this.setRect(left, top, width, height);
- }
-
- @Override
- public int compareTo(Rectangle other) {
- double thisBottom = this.getBottom();
- double otherBottom = other.getBottom();
- int rv;
-
- if (this.equals(other)) return 0;
-
- if (this.verticalOverlap(other) > VERTICAL_COMPARISON_THRESHOLD) {
- rv = java.lang.Double.compare(this.getX(), other.getX());
- }
- else {
- rv = java.lang.Double.compare(thisBottom, otherBottom);
- }
- return rv;
- }
-
- // I'm bad at Java and need this for fancy sorting in technology.tabula.TextChunk.
- public int isLtrDominant(){
- return 0;
- }
-
-
- public float getArea() {
- return this.width * this.height;
- }
-
- public float verticalOverlap(Rectangle other) {
- return (float) Math.max(0, Math.min(this.getBottom(), other.getBottom()) - Math.max(this.getTop(), other.getTop()));
- }
-
- public boolean verticallyOverlaps(Rectangle other) {
- return verticalOverlap(other) > 0;
- }
-
- public float horizontalOverlap(Rectangle other) {
- return (float) Math.max(0, Math.min(this.getRight(), other.getRight()) - Math.max(this.getLeft(), other.getLeft()));
- }
-
- public boolean horizontallyOverlaps(Rectangle other) {
- return horizontalOverlap(other) > 0;
- }
-
- public float verticalOverlapRatio(Rectangle other) {
- float rv = 0,
- delta = (float) Math.min(this.getBottom() - this.getTop(), other.getBottom() - other.getTop());
-
- if (other.getTop() <= this.getTop() && this.getTop() <= other.getBottom() && other.getBottom() <= this.getBottom()) {
- rv = (float) ((other.getBottom() - this.getTop()) / delta);
- }
- else if (this.getTop() <= other.getTop() && other.getTop() <= this.getBottom() && this.getBottom() <= other.getBottom()) {
- rv = (float) ((this.getBottom() - other.getTop()) / delta);
- }
- else if (this.getTop() <= other.getTop() && other.getTop() <= other.getBottom() && other.getBottom() <= this.getBottom()) {
- rv = (float) ((other.getBottom() - other.getTop()) / delta);
- }
- else if (other.getTop() <= this.getTop() && this.getTop() <= this.getBottom() && this.getBottom() <= other.getBottom()) {
- rv = (float) ((this.getBottom() - this.getTop()) / delta);
- }
-
- return rv;
-
- }
-
- public float overlapRatio(Rectangle other) {
- double intersectionWidth = Math.max(0, Math.min(this.getRight(), other.getRight()) - Math.max(this.getLeft(), other.getLeft()));
- double intersectionHeight = Math.max(0, Math.min(this.getBottom(), other.getBottom()) - Math.max(this.getTop(), other.getTop()));
- double intersectionArea = Math.max(0, intersectionWidth * intersectionHeight);
- double unionArea = this.getArea() + other.getArea() - intersectionArea;
-
- return (float) (intersectionArea / unionArea);
- }
-
- public Rectangle merge(Rectangle other) {
- this.setRect(this.createUnion(other));
- return this;
- }
-
- public float getTop() {
- return (float) this.getMinY();
- }
-
- public void setTop(float top) {
- float deltaHeight = top - this.y;
- this.setRect(this.x, top, this.width, this.height - deltaHeight);
- }
-
- public float getRight() {
- return (float) this.getMaxX();
- }
-
- public void setRight(float right) {
- this.setRect(this.x, this.y, right - this.x, this.height);
- }
-
- public float getLeft() {
- return (float) this.getMinX();
- }
-
- public void setLeft(float left) {
- float deltaWidth = left - this.x;
- this.setRect(left, this.y, this.width - deltaWidth, this.height);
- }
-
- public float getBottom() {
- return (float) this.getMaxY();
- }
-
- public void setBottom(float bottom) {
- this.setRect(this.x, this.y, this.width, bottom - this.y);
- }
-
- public Point2D[] getPoints() {
- return new Point2D[] {
- new Point2D.Float((float) this.getLeft(), (float) this.getTop()),
- new Point2D.Float((float) this.getRight(), (float) this.getTop()),
- new Point2D.Float((float) this.getRight(), (float) this.getBottom()),
- new Point2D.Float((float) this.getLeft(), (float) this.getBottom())
- };
- }
-
- @Override
- public String toString() {
- StringBuilder sb = new StringBuilder();
- String s = super.toString();
- sb.append(s.substring(0, s.length() - 1));
- sb.append(String.format(",bottom=%f,right=%f]", this.getBottom(), this.getRight()));
- return sb.toString();
- }
-
-
- /**
- * @param rectangles
- * @return minimum bounding box that contains all the rectangles
- */
- public static Rectangle boundingBoxOf(List extends Rectangle> rectangles) {
- float minx = java.lang.Float.MAX_VALUE;
- float miny = java.lang.Float.MAX_VALUE;
- float maxx = java.lang.Float.MIN_VALUE;
- float maxy = java.lang.Float.MIN_VALUE;
-
- for (Rectangle r: rectangles) {
- minx = (float) Math.min(r.getMinX(), minx);
- miny = (float) Math.min(r.getMinY(), miny);
- maxx = (float) Math.max(r.getMaxX(), maxx);
- maxy = (float) Math.max(r.getMaxY(), maxy);
- }
- return new Rectangle(miny, minx, maxx - minx, maxy - miny);
- }
-
+public class Rectangle extends Rectangle2D.Float {
+
+ /**
+ * Ill-defined comparator, from when Rectangle was Comparable.
+ *
+ * @see <a href="https://github.com/tabulapdf/tabula-java/pull/116">PR 116</a>
+ * @deprecated with no replacement
+ */
+ @Deprecated
+ public static final Comparator ILL_DEFINED_ORDER = new Comparator() {
+ @Override public int compare(Rectangle o1, Rectangle o2) {
+ if (o1.equals(o2)) return 0;
+ if (o1.verticalOverlap(o2) > VERTICAL_COMPARISON_THRESHOLD) {
+ return o1.isLtrDominant() == -1 && o2.isLtrDominant() == -1
+ ? - java.lang.Double.compare(o1.getX(), o2.getX())
+ : java.lang.Double.compare(o1.getX(), o2.getX());
+ } else {
+ return java.lang.Float.compare(o1.getBottom(), o2.getBottom());
+ }
+ }
+ };
+
+ protected static final float VERTICAL_COMPARISON_THRESHOLD = 0.4f;
+
+ public Rectangle() {
+ super();
+ }
+
+ public Rectangle(float top, float left, float width, float height) {
+ super();
+ this.setRect(left, top, width, height);
+ }
+
+ public int compareTo(Rectangle other) {
+ return ILL_DEFINED_ORDER.compare(this, other);
+ }
+
+ // Returns 0 here. ILL_DEFINED_ORDER reverses the horizontal ordering when both
+ // rectangles return -1; needed for sorting in technology.tabula.TextChunk.
+ public int isLtrDominant() {
+ return 0;
+ }
+
+ public float getArea() {
+ return this.width * this.height;
+ }
+
+ public float verticalOverlap(Rectangle other) {
+ return Math.max(0, Math.min(this.getBottom(), other.getBottom()) - Math.max(this.getTop(), other.getTop()));
+ }
+
+ public boolean verticallyOverlaps(Rectangle other) {
+ return verticalOverlap(other) > 0;
+ }
+
+ public float horizontalOverlap(Rectangle other) {
+ return Math.max(0, Math.min(this.getRight(), other.getRight()) - Math.max(this.getLeft(), other.getLeft()));
+ }
+
+ public boolean horizontallyOverlaps(Rectangle other) {
+ return horizontalOverlap(other) > 0;
+ }
+
+ public float verticalOverlapRatio(Rectangle other) {
+ float rv = 0, delta = Math.min(this.getBottom() - this.getTop(), other.getBottom() - other.getTop());
+
+ if (other.getTop() <= this.getTop() && this.getTop() <= other.getBottom()
+ && other.getBottom() <= this.getBottom()) {
+ rv = (other.getBottom() - this.getTop()) / delta;
+ } else if (this.getTop() <= other.getTop() && other.getTop() <= this.getBottom()
+ && this.getBottom() <= other.getBottom()) {
+ rv = (this.getBottom() - other.getTop()) / delta;
+ } else if (this.getTop() <= other.getTop() && other.getTop() <= other.getBottom()
+ && other.getBottom() <= this.getBottom()) {
+ rv = (other.getBottom() - other.getTop()) / delta;
+ } else if (other.getTop() <= this.getTop() && this.getTop() <= this.getBottom()
+ && this.getBottom() <= other.getBottom()) {
+ rv = (this.getBottom() - this.getTop()) / delta;
+ }
+
+ return rv;
+
+ }
+
+ public float overlapRatio(Rectangle other) {
+ double intersectionWidth = Math.max(0,
+ Math.min(this.getRight(), other.getRight()) - Math.max(this.getLeft(), other.getLeft()));
+ double intersectionHeight = Math.max(0,
+ Math.min(this.getBottom(), other.getBottom()) - Math.max(this.getTop(), other.getTop()));
+ double intersectionArea = Math.max(0, intersectionWidth * intersectionHeight);
+ double unionArea = this.getArea() + other.getArea() - intersectionArea;
+
+ return (float) (intersectionArea / unionArea);
+ }
+
+ public Rectangle merge(Rectangle other) {
+ this.setRect(this.createUnion(other));
+ return this;
+ }
+
+ public float getTop() {
+ return (float) this.getMinY();
+ }
+
+ public void setTop(float top) {
+ float deltaHeight = top - this.y;
+ this.setRect(this.x, top, this.width, this.height - deltaHeight);
+ }
+
+ public float getRight() {
+ return (float) this.getMaxX();
+ }
+
+ public void setRight(float right) {
+ this.setRect(this.x, this.y, right - this.x, this.height);
+ }
+
+ public float getLeft() {
+ return (float) this.getMinX();
+ }
+
+ public void setLeft(float left) {
+ float deltaWidth = left - this.x;
+ this.setRect(left, this.y, this.width - deltaWidth, this.height);
+ }
+
+ public float getBottom() {
+ return (float) this.getMaxY();
+ }
+
+ public void setBottom(float bottom) {
+ this.setRect(this.x, this.y, this.width, bottom - this.y);
+ }
+
+ public Point2D[] getPoints() {
+ return new Point2D[] { new Point2D.Float(this.getLeft(), this.getTop()),
+ new Point2D.Float(this.getRight(), this.getTop()), new Point2D.Float(this.getRight(), this.getBottom()),
+ new Point2D.Float(this.getLeft(), this.getBottom()) };
+ }
+
+ @Override
+ public String toString() {
+ StringBuilder sb = new StringBuilder();
+ String s = super.toString();
+ sb.append(s.substring(0, s.length() - 1));
+ sb.append(String.format(Locale.US, ",bottom=%f,right=%f]", this.getBottom(), this.getRight()));
+ return sb.toString();
+ }
+
+ /**
+ * @param rectangles
+ * @return minimum bounding box that contains all the rectangles
+ */
+ public static Rectangle boundingBoxOf(List extends Rectangle> rectangles) {
+ float minx = java.lang.Float.MAX_VALUE;
+ float miny = java.lang.Float.MAX_VALUE;
+ float maxx = java.lang.Float.MIN_VALUE;
+ float maxy = java.lang.Float.MIN_VALUE;
+
+ for (Rectangle r : rectangles) {
+ minx = (float) Math.min(r.getMinX(), minx);
+ miny = (float) Math.min(r.getMinY(), miny);
+ maxx = (float) Math.max(r.getMaxX(), maxx);
+ maxy = (float) Math.max(r.getMaxY(), maxy);
+ }
+ return new Rectangle(miny, minx, maxx - minx, maxy - miny);
+ }
}
diff --git a/src/main/java/technology/tabula/RectangleSpatialIndex.java b/src/main/java/technology/tabula/RectangleSpatialIndex.java
index 498106db..0e942545 100644
--- a/src/main/java/technology/tabula/RectangleSpatialIndex.java
+++ b/src/main/java/technology/tabula/RectangleSpatialIndex.java
@@ -1,79 +1,39 @@
package technology.tabula;
-import gnu.trove.procedure.TIntProcedure;
-
import java.util.ArrayList;
import java.util.List;
-import net.sf.jsi.SpatialIndex;
-import net.sf.jsi.rtree.RTree;
+import org.locationtech.jts.geom.Envelope;
+import org.locationtech.jts.index.strtree.STRtree;
-class RectangleSpatialIndex {
+public class RectangleSpatialIndex {
- class SaveToListProcedure implements TIntProcedure {
- private List ids = new ArrayList();
-
- public boolean execute(int id) {
- ids.add(id);
- return true;
- }
- private List getIds() {
- return ids;
- }
- }
+ private final STRtree si = new STRtree();
+ private final List rectangles = new ArrayList<>();
- private final SpatialIndex si;
- private final List rectangles;
- private Rectangle bounds = null;
-
- public RectangleSpatialIndex() {
- si = new RTree();
- si.init(null);
- rectangles = new ArrayList();
- }
-
public void add(T te) {
rectangles.add(te);
- if (bounds == null) {
- bounds = new Rectangle();
- bounds.setRect(te);
- }
- else {
- bounds.merge(te);
- }
- si.add(rectangleToSpatialIndexRectangle(te), rectangles.size() - 1);
+ si.insert(new Envelope(te.getLeft(), te.getRight(), te.getBottom(), te.getTop()), te);
}
public List contains(Rectangle r) {
- SaveToListProcedure proc = new SaveToListProcedure();
- si.contains(rectangleToSpatialIndexRectangle(r), proc);
- ArrayList rv = new ArrayList();
- for (int i : proc.getIds()) {
- rv.add(rectangles.get(i));
+ List intersection = si.query(new Envelope(r.getLeft(), r.getRight(), r.getTop(), r.getBottom()));
+ List rv = new ArrayList();
+
+ for (T ir: intersection) {
+ if (r.contains(ir)) {
+ rv.add(ir);
+ }
}
- Utils.sort(rv);
+
+ Utils.sort(rv, Rectangle.ILL_DEFINED_ORDER);
return rv;
}
public List intersects(Rectangle r) {
- SaveToListProcedure proc = new SaveToListProcedure();
- si.intersects(rectangleToSpatialIndexRectangle(r), proc);
- ArrayList rv = new ArrayList();
- for (int i : proc.getIds()) {
- rv.add(rectangles.get(i));
- }
- Utils.sort(rv);
- return rv;
- }
-
- private net.sf.jsi.Rectangle rectangleToSpatialIndexRectangle(Rectangle r) {
- return new net.sf.jsi.Rectangle((float) r.getX(),
- (float) r.getY(),
- (float) (r.getX() + r.getWidth()),
- (float) (r.getY() + r.getHeight()));
+ return si.query(new Envelope(r.getLeft(), r.getRight(), r.getTop(), r.getBottom()));
}
-
/**
* Minimum bounding box of all the Rectangles contained on this RectangleSpatialIndex
@@ -81,7 +41,7 @@ private net.sf.jsi.Rectangle rectangleToSpatialIndexRectangle(Rectangle r) {
* @return a Rectangle
*/
public Rectangle getBounds() {
- return bounds;
+ return Rectangle.boundingBoxOf(rectangles);
}
}
diff --git a/src/main/java/technology/tabula/RectangularTextContainer.java b/src/main/java/technology/tabula/RectangularTextContainer.java
index f9e0036f..934b5f13 100644
--- a/src/main/java/technology/tabula/RectangularTextContainer.java
+++ b/src/main/java/technology/tabula/RectangularTextContainer.java
@@ -1,35 +1,51 @@
package technology.tabula;
+import java.util.ArrayList;
import java.util.List;
@SuppressWarnings("serial")
-public abstract class RectangularTextContainer extends Rectangle {
-
- public RectangularTextContainer(float top, float left, float width, float height) {
- super(top, left, width, height);
- }
-
- public String toString() {
- StringBuilder sb = new StringBuilder();
- String s = super.toString();
- sb.append(s.substring(0, s.length() - 1));
- sb.append(String.format(",text=%s]", this.getText() == null ? "null" : "\"" + this.getText() + "\""));
- return sb.toString();
- }
-
- public RectangularTextContainer merge(RectangularTextContainer other) {
- if (this.compareTo(other) < 0) {
- this.getTextElements().addAll(other.getTextElements());
-
- }
- else {
- this.getTextElements().addAll(0, other.getTextElements());
- }
- super.merge(other);
- return this;
- }
-
- public abstract String getText();
- public abstract String getText(boolean useLineReturns);
- public abstract List getTextElements();
+public class RectangularTextContainer extends Rectangle implements HasText {
+
+ protected List textElements = new ArrayList<>();
+
+ protected RectangularTextContainer(float top, float left, float width, float height) {
+ super(top, left, width, height);
+ }
+
+ public RectangularTextContainer merge(RectangularTextContainer other) {
+ if (compareTo(other) < 0) {
+ this.getTextElements().addAll(other.getTextElements());
+ } else {
+ this.getTextElements().addAll(0, other.getTextElements());
+ }
+ super.merge(other);
+ return this;
+ }
+
+ public List getTextElements() {
+ return textElements;
+ }
+
+ public void setTextElements(List textElements) {
+ this.textElements = textElements;
+ }
+
+ @Override
+ public String getText() {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public String getText(boolean useLineReturns) {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override public String toString() {
+ StringBuilder sb = new StringBuilder();
+ String s = super.toString();
+ sb.append(s.substring(0, s.length() - 1));
+ sb.append(String.format(",text=%s]", this.getText() == null ? "null" : "\"" + this.getText() + "\""));
+ return sb.toString();
+ }
+
}
diff --git a/src/main/java/technology/tabula/Ruling.java b/src/main/java/technology/tabula/Ruling.java
index 8eb16b5e..213ce87f 100644
--- a/src/main/java/technology/tabula/Ruling.java
+++ b/src/main/java/technology/tabula/Ruling.java
@@ -8,6 +8,7 @@
import java.util.Comparator;
import java.util.Formatter;
import java.util.List;
+import java.util.Locale;
import java.util.Map;
import java.util.TreeMap;
@@ -39,9 +40,6 @@ public void normalize() {
else if (Utils.within(angle, 90, 1) || Utils.within(angle, 270, 1)) { // almost vertical
this.setLine(this.x1, this.y1, this.x1, this.y2);
}
-// else {
-// System.out.println("oblique: " + this + " ("+ this.getAngle() + ")");
-// }
}
public boolean vertical() {
@@ -230,11 +228,6 @@ public boolean equals(Object other) {
return this.getP1().equals(o.getP1()) && this.getP2().equals(o.getP2());
}
- @Override
- public int hashCode() {
- return super.hashCode();
- }
-
public float getTop() {
return this.y1;
}
@@ -291,13 +284,13 @@ public double getAngle() {
public String toString() {
StringBuilder sb = new StringBuilder();
Formatter formatter = new Formatter(sb);
- String rv = formatter.format("%s[x1=%f y1=%f x2=%f y2=%f]", this.getClass().toString(), this.x1, this.y1, this.x2, this.y2).toString();
+ String rv = formatter.format(Locale.US, "%s[x1=%f y1=%f x2=%f y2=%f]", this.getClass().toString(), this.x1, this.y1, this.x2, this.y2).toString();
formatter.close();
return rv;
}
public static List cropRulingsToArea(List rulings, Rectangle2D area) {
- ArrayList rv = new ArrayList();
+ ArrayList rv = new ArrayList<>();
for (Ruling r : rulings) {
if (r.intersects(area)) {
rv.add(r.intersect(area));
@@ -322,15 +315,15 @@ public SortObject(SOType type, float position, Ruling ruling) {
}
}
- List sos = new ArrayList();
+ List sos = new ArrayList<>();
- TreeMap tree = new TreeMap(new Comparator() {
+ TreeMap tree = new TreeMap<>(new Comparator() {
@Override
public int compare(Ruling o1, Ruling o2) {
return java.lang.Double.compare(o1.getTop(), o2.getTop());
}});
- TreeMap rv = new TreeMap(new Comparator() {
+ TreeMap rv = new TreeMap<>(new Comparator() {
@Override
public int compare(Point2D o1, Point2D o2) {
if (o1.getY() > o2.getY()) return 1;
@@ -409,7 +402,7 @@ public static List collapseOrientedRulings(List lines) {
}
public static List collapseOrientedRulings(List lines, int expandAmount) {
- ArrayList rv = new ArrayList();
+ ArrayList rv = new ArrayList<>();
Collections.sort(lines, new Comparator() {
@Override
public int compare(Ruling a, Ruling b) {
diff --git a/src/main/java/technology/tabula/Table.java b/src/main/java/technology/tabula/Table.java
index eda11251..1e73bedf 100644
--- a/src/main/java/technology/tabula/Table.java
+++ b/src/main/java/technology/tabula/Table.java
@@ -8,139 +8,98 @@
@SuppressWarnings("serial")
public class Table extends Rectangle {
-
- class CellPosition implements Comparable {
- int row, col;
- CellPosition(int row, int col) {
- this.row = row; this.col = col;
- }
-
- @Override
- public boolean equals(Object other) {
- if (this == other)
- return true;
- if (!(other instanceof CellPosition))
- return false;
- return other != null && this.row == ((CellPosition) other).row && this.col == ((CellPosition) other).col;
- }
-
- @Override
- public int hashCode() {
- return this.row * 100000 + this.col;
- }
-
- @Override
- public int compareTo(CellPosition other) {
- int rv = 0;
- if(this.row < other.row) {
- rv = -1;
- }
- else if (this.row > other.row) {
- rv = 1;
- }
- else if (this.col > other.col) {
- rv = 1;
- }
- else if (this.col < other.col) {
- rv = -1;
- }
- return rv;
- }
- }
-
- class CellContainer extends TreeMap {
-
- public int maxRow = 0, maxCol = 0;
-
- public RectangularTextContainer get(int row, int col) {
- return this.get(new CellPosition(row, col));
- }
-
- public List getRow(int row) {
- return new ArrayList(this.subMap(new CellPosition(row, 0), new CellPosition(row, maxRow+1)).values());
- }
-
- @Override
- public RectangularTextContainer put(CellPosition cp, RectangularTextContainer value) {
- this.maxRow = Math.max(maxRow, cp.row);
- this.maxCol = Math.max(maxCol, cp.col);
- if (this.containsKey(cp)) { // adding on an existing CellPosition, concatenate content and resize
- value.merge(this.get(cp));
- }
- super.put(cp, value);
- return value;
- }
-
- @Override
- public RectangularTextContainer get(Object key) {
- return this.containsKey(key) ? super.get(key) : TextChunk.EMPTY;
- }
-
- public boolean containsKey(int row, int col) {
- return this.containsKey(new CellPosition(row, col));
- }
-
- }
-
- public static final Table EMPTY = new Table();
-
- CellContainer cellContainer = new CellContainer();
- Page page;
- ExtractionAlgorithm extractionAlgorithm;
- List> rows = null;
-
- public Table() {
- super();
- }
-
- public Table(Page page, ExtractionAlgorithm extractionAlgorithm) {
- this();
- this.page = page;
- this.extractionAlgorithm = extractionAlgorithm;
- }
-
- public void add(RectangularTextContainer tc, int i, int j) {
- this.merge(tc);
- this.cellContainer.put(new CellPosition(i, j), tc);
- this.rows = null; // clear the memoized rows
- }
-
- public List> getRows() {
- if (this.rows != null) {
- return this.rows;
- }
-
- this.rows = new ArrayList>();
- for (int i = 0; i <= this.cellContainer.maxRow; i++) {
- List lastRow = new ArrayList();
- this.rows.add(lastRow);
- for (int j = 0; j <= this.cellContainer.maxCol; j++) {
- lastRow.add(this.cellContainer.containsKey(i, j) ? this.cellContainer.get(i, j) : TextChunk.EMPTY);
- }
- }
- return this.rows;
- }
-
- public RectangularTextContainer getCell(int i, int j) {
- return this.cellContainer.get(i, j);
- }
-
- public List> getCols() {
- return Utils.transpose(this.getRows());
- }
-
- public void setExtractionAlgorithm(ExtractionAlgorithm extractionAlgorithm) {
- this.extractionAlgorithm = extractionAlgorithm;
- }
-
- public ExtractionAlgorithm getExtractionAlgorithm() {
- return extractionAlgorithm;
- }
-
- public List getCells() {
- return (List) new ArrayList(this.cellContainer.values());
- }
-
-
+
+ public static final Table empty() { return new Table(""); }
+
+ private Table(String extractionMethod) {
+ this.extractionMethod = extractionMethod;
+ }
+
+ public Table(ExtractionAlgorithm extractionAlgorithm) {
+ this(extractionAlgorithm.toString());
+ }
+
+ private final String extractionMethod;
+
+ private int rowCount = 0;
+ private int colCount = 0;
+ private int pageNumber = 0;
+
+ /* visible for testing */ final TreeMap cells = new TreeMap<>();
+
+ public int getRowCount() { return rowCount; }
+ public int getColCount() { return colCount; }
+ public int getPageNumber() { return pageNumber; }
+ public void setPageNumber(int pageNumber) { this.pageNumber = pageNumber; }
+
+ public String getExtractionMethod() { return extractionMethod; }
+
+ public void add(RectangularTextContainer chunk, int row, int col) {
+ this.merge(chunk);
+
+ rowCount = Math.max(rowCount, row + 1);
+ colCount = Math.max(colCount, col + 1);
+
+ CellPosition cp = new CellPosition(row, col);
+
+ RectangularTextContainer old = cells.get(cp);
+ if (old != null) chunk.merge(old);
+ cells.put(cp, chunk);
+
+ this.memoizedRows = null;
+ }
+
+ private List> memoizedRows = null;
+
+ public List> getRows() {
+ if (this.memoizedRows == null) this.memoizedRows = computeRows();
+ return this.memoizedRows;
+ }
+
+ private List> computeRows() {
+ List> rows = new ArrayList<>();
+ for (int i = 0; i < rowCount; i++) {
+ List lastRow = new ArrayList<>();
+ rows.add(lastRow);
+ for (int j = 0; j < colCount; j++) {
+ RectangularTextContainer cell = cells.get(new CellPosition(i,j)); // JAVA_8 use getOrDefault()
+ lastRow.add(cell != null ? cell : TextChunk.EMPTY);
+ }
+ }
+ return rows;
+ }
+
+ public RectangularTextContainer getCell(int i, int j) {
+ RectangularTextContainer cell = cells.get(new CellPosition(i,j)); // JAVA_8 use getOrDefault()
+ return cell != null ? cell : TextChunk.EMPTY;
+ }
+
+}
+
+class CellPosition implements Comparable {
+
+ CellPosition(int row, int col) {
+ this.row = row;
+ this.col = col;
+ }
+
+ final int row, col;
+
+ @Override public int hashCode() {
+ return row + 101 * col;
+ }
+
+ @Override public boolean equals(Object obj) {
+ if (this == obj) return true;
+ if (obj == null) return false;
+ if (getClass() != obj.getClass()) return false;
+ CellPosition other = (CellPosition) obj;
+ return row == other.row && col == other.col;
+ }
+
+ @Override public int compareTo(CellPosition other) {
+ int rowdiff = row - other.row;
+ return rowdiff != 0 ? rowdiff : col - other.col;
+ }
}
diff --git a/src/main/java/technology/tabula/TableWithRulingLines.java b/src/main/java/technology/tabula/TableWithRulingLines.java
index 54de67e2..cde0ce72 100644
--- a/src/main/java/technology/tabula/TableWithRulingLines.java
+++ b/src/main/java/technology/tabula/TableWithRulingLines.java
@@ -6,25 +6,21 @@
import java.util.Iterator;
import java.util.List;
+import technology.tabula.extractors.ExtractionAlgorithm;
+
@SuppressWarnings("serial")
public class TableWithRulingLines extends Table {
List verticalRulings, horizontalRulings;
- RectangleSpatialIndex si = new RectangleSpatialIndex();
+ RectangleSpatialIndex si = new RectangleSpatialIndex<>();
- public TableWithRulingLines() {
- super();
- }
-
- public TableWithRulingLines(Rectangle area, Page page, List cells,
- List horizontalRulings,
- List verticalRulings) {
- this();
+ public TableWithRulingLines(Rectangle area, List cells, List horizontalRulings, List verticalRulings, ExtractionAlgorithm extractionAlgorithm, int pageNumber) {
+ super(extractionAlgorithm);
this.setRect(area);
- this.page = page;
this.verticalRulings = verticalRulings;
this.horizontalRulings = horizontalRulings;
this.addCells(cells);
+ this.setPageNumber(pageNumber);
}
private void addCells(List