diff --git a/.github/dependabot.yml b/.github/dependabot.yml
new file mode 100644
index 00000000..a217b347
--- /dev/null
+++ b/.github/dependabot.yml
@@ -0,0 +1,7 @@
+version: 2
+updates:
+- package-ecosystem: maven
+  directory: "/"
+  schedule:
+    interval: daily
+  open-pull-requests-limit: 10
diff --git a/.github/workflows/tests-windows.yml b/.github/workflows/tests-windows.yml
new file mode 100644
index 00000000..5cc1031a
--- /dev/null
+++ b/.github/workflows/tests-windows.yml
@@ -0,0 +1,23 @@
+name: Java CI (Windows)
+
+on: [push]
+
+jobs:
+  build:
+    runs-on: windows-latest
+
+    steps:
+      # https://github.com/actions/checkout/issues/135#issuecomment-602171132
+      - name: Set git to use LF
+        run: |
+          git config --global core.autocrlf false
+          git config --global core.eol lf
+      - uses: actions/checkout@v4
+      - name: Set up JDK 11
+        uses: actions/setup-java@v4
+        with:
+          java-version: '11'
+          distribution: 'temurin'
+          cache: maven
+      - name: Build with Maven
+        run: mvn --batch-mode test
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
new file mode 100644
index 00000000..da2d019b
--- /dev/null
+++ b/.github/workflows/tests.yml
@@ -0,0 +1,18 @@
+name: Java CI
+
+on: [push, pull_request]
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v4
+      - name: Set up JDK 11
+        uses: actions/setup-java@v4
+        with:
+          java-version: '11'
+          distribution: 'temurin'
+          cache: maven
+      - name: Build with Maven
+        run: mvn --batch-mode test
diff --git a/.gitignore b/.gitignore
index 712da218..0247d35b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,9 @@
.settings/
+.idea/
.project
.classpath
/bin/
+/src/test/**/*.jpg
+/src/test/resources/technology/tabula/icdar2013-dataset/test-statistics.json
/target/
+*.iml
diff --git a/.travis.yml b/.travis.yml
deleted file mode 100644
index 2a5ffc72..00000000
--- a/.travis.yml
+++ /dev/null
@@ -1,10 +0,0 @@
-language: java
-script: mvn test
-jdk:
- - oraclejdk7
- - openjdk7
- - oraclejdk8
-sudo: false
-
-
-
diff --git a/LICENSE b/LICENSE
index 06bdd025..4beb04ee 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,6 +1,6 @@
The MIT License (MIT)
-Copyright (c) 2014 Manuel Aristarán
+Copyright (c) 2014-2016 Manuel Aristarán
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
@@ -18,4 +18,4 @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
\ No newline at end of file
+SOFTWARE.
diff --git a/README.md b/README.md
index 614d67ab..db7b0023 100644
--- a/README.md
+++ b/README.md
@@ -1,62 +1,158 @@
-tabula-java [](https://travis-ci.org/tabulapdf/tabula-java)
+tabula-java [](https://travis-ci.org/tabulapdf/tabula-java)
===========
-`tabula-java` is a library for extracting tables from PDF files. It is a Java rewrite of [`tabula-extractor`](http://github.com/tabulapdf/tabula-extractor), that is soon to become a thin wrapper around this library.
+`tabula-java` is a library for extracting tables from PDF files — it is the table extraction engine that powers [Tabula](http://tabula.technology/) ([repo](http://github.com/tabulapdf/tabula)). You can use `tabula-java` as a command-line tool to programmatically extract tables from PDFs.
-## Build instructions
+© 2014-2020 Manuel Aristarán. Available under MIT License. See [`LICENSE`](LICENSE).
-Clone this repo and run:
+## Download
-```
-mvn clean compile assembly:single
-```
+Download a version of the tabula-java jar, with all dependencies included, that works on Mac, Windows and Linux, from our [releases page](../../releases).
-## Examples
+## Commandline Usage Examples
`tabula-java` provides a command line application:
```
-$ java -jar ./target/tabula-extractor-0.7.4-SNAPSHOT-jar-with-dependencies.jar --help
-
-usage: tabula [-a ] [-c ] [-d] [-f ] [-g] [-h] [-i]
- [-n] [-o ] [-p ] [-r] [-s ] [-u] [-v]
+$ java -jar target/tabula-1.0.5-jar-with-dependencies.jar --help
+usage: tabula [-a ] [-b ] [-c ] [-f ]
+ [-g] [-h] [-i] [-l] [-n] [-o ] [-p ] [-r] [-s
+ ] [-t] [-u] [-v]
Tabula helps you extract tables from PDFs
- -a,--area Portion of the page to analyze
- (top,left,bottom,right). Example: --area
- 269.875,12.75,790.5,561. Default is entire
- page
+
+ -a,--area -a/--area = Portion of the page to analyze.
+ Example: --area 269.875,12.75,790.5,561.
+ Accepts top,left,bottom,right i.e. y1,x1,y2,x2
+ where all values are in points relative to the
+ top left corner. If all values are between
+ 0-100 (inclusive) and preceded by '%', input
+ will be taken as % of actual height or width
+ of the page. Example: --area %0,0,100,50. To
+ specify multiple areas, -a option should be
+ repeated. Default is entire page
+ -b,--batch Convert all .pdfs in the provided directory.
-c,--columns X coordinates of column boundaries. Example
- --columns 10.1,20.2,30.3
- -d,--debug Print detected table areas instead of
- processing.
+ --columns 10.1,20.2,30.3. If all values are
+ between 0-100 (inclusive) and preceded by '%',
+ input will be taken as % of actual width of
+ the page. Example: --columns %25,50,80.6
-f,--format Output format: (CSV,TSV,JSON). Default: CSV
-g,--guess Guess the portion of the page to analyze per
page.
-h,--help Print this help text.
-i,--silent Suppress all stderr output.
- -n,--no-spreadsheet Force PDF not to be extracted using
- spreadsheet-style extraction (if there are
- ruling lines separating each cell, as in a PDF
- of an Excel spreadsheet)
+ -l,--lattice Force PDF to be extracted using lattice-mode
+ extraction (if there are ruling lines
+ separating each cell, as in a PDF of an Excel
+ spreadsheet)
+ -n,--no-spreadsheet [Deprecated in favor of -t/--stream] Force PDF
+ not to be extracted using spreadsheet-style
+ extraction (if there are no ruling lines
+ separating each cell)
-o,--outfile Write output to instead of STDOUT.
Default: -
-p,--pages Comma separated list of ranges, or all.
Examples: --pages 1-3,5-7, --pages 3 or
--pages all. Default is --pages 1
- -r,--spreadsheet Force PDF to be extracted using
- spreadsheet-style extraction (if there are
- ruling lines separating each cell, as in a PDF
- of an Excel spreadsheet)
+ -r,--spreadsheet [Deprecated in favor of -l/--lattice] Force
+ PDF to be extracted using spreadsheet-style
+ extraction (if there are ruling lines
+ separating each cell, as in a PDF of an Excel
+ spreadsheet)
-s,--password Password to decrypt document. Default is empty
+ -t,--stream Force PDF to be extracted using stream-mode
+ extraction (if there are no ruling lines
+ separating each cell)
-u,--use-line-returns Use embedded line returns in cells. (Only in
spreadsheet mode.)
-v,--version Print version and exit.
-
```
-It also includes a debugging tool, run `java -cp ./target/tabula-extractor-0.7.4-SNAPSHOT-jar-with-dependencies.jar technology.tabula.debug.Debug -h` for the available options.
+It also includes a debugging tool, run `java -cp ./target/tabula-1.0.5-jar-with-dependencies.jar technology.tabula.debug.Debug -h` for the available options.
You can also integrate `tabula-java` with any JVM language. For Java examples, see the [`tests`](src/test/java/technology/tabula/) folder.
-© 2014 Manuel Aristarán. Available under MIT License. See [`LICENSE`](LICENSE).
+JVM start-up time is a lot of the cost of the `tabula` command, so if you're trying to extract many tables from PDFs, you have a few options for speeding it up:
+
+ - the -b option, which allows you to convert all pdfs in a given directory
+ - the [drip](https://github.com/ninjudd/drip) utility
+ - the [Ruby](http://github.com/tabulapdf/tabula-extractor), [Python](https://github.com/chezou/tabula-py), [R](https://github.com/leeper/tabulizer), and [Node.js](https://github.com/ezodude/tabula-js) bindings
+ - writing your own program in any JVM language (Java, JRuby, Scala) that imports tabula-java.
+ - waiting for us to implement an API/server-style system (it's on the [roadmap](https://github.com/tabulapdf/tabula-api))
+
+## API Usage Examples
+
+A simple Java code example which extracts all rows and cells from all tables of all pages of a PDF document:
+
+```java
+InputStream in = this.getClass().getResourceAsStream("my.pdf");
+try (PDDocument document = PDDocument.load(in)) {
+    SpreadsheetExtractionAlgorithm sea = new SpreadsheetExtractionAlgorithm();
+    PageIterator pi = new ObjectExtractor(document).extract();
+    while (pi.hasNext()) {
+        // iterate over the pages of the document
+        Page page = pi.next();
+        List<Table> tables = sea.extract(page);
+        // iterate over the tables of the page
+        for (Table table : tables) {
+            List<List<RectangularTextContainer>> rows = table.getRows();
+            // iterate over the rows of the table
+            for (List<RectangularTextContainer> cells : rows) {
+                // print all column-cells of the row plus linefeed
+                for (RectangularTextContainer content : cells) {
+                    // Note: Cell.getText() uses \r to concat text chunks
+                    String text = content.getText().replace("\r", " ");
+                    System.out.print(text + "|");
+                }
+                System.out.println();
+            }
+        }
+    }
+}
+```
+
+
+For more detailed information, check the Javadoc.
+The Javadoc API documentation can be generated (see also the '_Building from Source_' section) via
+
+```
+mvn javadoc:javadoc
+```
+
+which writes the HTML files to the ```target/site/apidocs/``` directory.
+
+## Building from Source
+
+Clone this repo and run:
+
+```
+mvn clean compile assembly:single
+```
+
+## Contributing
+
+Interested in helping out? We'd love to have your help!
+
+You can help by:
+
+- [Reporting a bug](https://github.com/tabulapdf/tabula-java/issues).
+- Adding or editing documentation.
+- Contributing code via a Pull Request.
+- Spreading the word about `tabula-java` to people who might be able to benefit from using it.
+
+### Backers
+
+You can also support our continued work on `tabula-java` with a one-time or monthly donation [on OpenCollective](https://opencollective.com/tabulapdf#support). Organizations who use `tabula-java` can also [sponsor the project](https://opencollective.com/tabulapdf#support) for acknowledgement on [our official site](http://tabula.technology/) and this README.
+
+Special thanks to the following users and organizations for generously supporting Tabula with donations and grants:
+
+
+
+
+
+
+
+
+
+
diff --git a/jbang-catalog.json b/jbang-catalog.json
new file mode 100644
index 00000000..b7f71347
--- /dev/null
+++ b/jbang-catalog.json
@@ -0,0 +1,8 @@
+{
+ "catalogs": {},
+ "aliases": {
+ "tabula": {
+ "script-ref": "https://github.com/tabulapdf/tabula-java/releases/download/v1.0.4/tabula-1.0.4-jar-with-dependencies.jar"
+ }
+ }
+}
\ No newline at end of file
diff --git a/pom.xml b/pom.xml
index 991ccd28..211d0d4d 100644
--- a/pom.xml
+++ b/pom.xml
@@ -1,59 +1,156 @@
4.0.0
- tabula-extractor
- tabula-extractor
- 0.7.4-SNAPSHOT
- tabula-extractor
+ technology.tabula
+ tabula
+ 1.0.6-SNAPSHOT
+ Tabula
Extract tables from PDF files
+ http://github.com/tabulapdf/tabula-java
-
- UTF-8
- UTF-8
-
+
+
+ MIT License
+ http://www.opensource.org/licenses/mit-license.php
+
+
-
-
-
- sonatype
- Sonatype repository
- https://oss.sonatype.org/content/repositories/snapshots/
-
+
+
+ Manuel Aristaran
+ Tabula
+ http://github.com/tabulapdf
+
+
+ Jeremy B. Merrill
+ Tabula
+ http://github.com/tabulapdf
+
+
+ Mike Tigas
+ Tabula
+ http://github.com/tabulapdf
+
+
+
+ snapshots
+ https://repository.apache.org/content/repositories/snapshots/
+
+ false
+
true
- always
- warn
- apachesnapshots
- apache snapshots
- http://repository.apache.org/snapshots/
- default
-
-
+
+
+ scm:git:git@github.com:tabulapdf/tabula-java.git
+ scm:git:git@github.com:tabulapdf/tabula-java.git
+ git@github.com:tabulapdf/tabula-java.git
+ v1.0.2
+
+
+
+ UTF-8
+ UTF-8
+
+
+
+
+ ossrh
+ https://oss.sonatype.org/content/repositories/snapshots
+
+
+ ossrh
+ https://oss.sonatype.org/service/local/staging/deploy/maven2/
+
+
+
-
-
- org.apache.maven.plugins
- maven-javadoc-plugin
- 2.10.3
-
-
-
-
-
+
+
+ org.apache.maven.plugins
+ maven-javadoc-plugin
+ 3.8.0
+
+ true
+
+
+
+
+
+ org.sonatype.plugins
+ nexus-staging-maven-plugin
+ 1.7.0
+ true
+
+ ossrh
+ https://oss.sonatype.org/
+ 40766864c3b853
+ true
+
+
+
+ org.apache.maven.plugins
+ maven-source-plugin
+ 3.3.1
+
+
+ attach-sources
+
+ jar-no-fork
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-javadoc-plugin
+ 3.8.0
+
+ 8
+
+
+
+ attach-javadocs
+
+ jar
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-gpg-plugin
+ 3.2.4
+
+
+ sign-artifacts
+ verify
+
+ sign
+
+
+
+ --pinentry-mode
+ loopback
+
+
+
+
+
maven-compiler-plugin
- 3.1
+ 3.13.0
- 1.6
- 1.6
+ 1.8
+ 1.8
@@ -69,91 +166,159 @@
-
+ -Xms1024m -Xmx2048m
-
-
- first
-
- update-file-header
-
- process-sources
-
- gpl_v3
-
- *Dao.java
- *Bean.java
- .*entities.*.xml
-
-
-
-
-
+
+
+
+ org.apache.maven.plugins
+ maven-eclipse-plugin
+ 2.10
+
+ true
+ true
+
+
+
+
+ release
+
+
+
+ org.apache.maven.plugins
+ maven-javadoc-plugin
+ 3.8.0
+
+ 8
+
+
+
+ attach-javadocs
+
+ jar
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-source-plugin
+ 3.3.1
+
+
+ attach-sources
+
+ jar-no-fork
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-gpg-plugin
+ 3.2.4
+
+
+ sign-artifacts
+ verify
+
+ sign
+
+
+
+
+
+
+
+
+
- net.sf.jsi
- jsi
- 1.1.0-SNAPSHOT
+ org.locationtech.jts
+ jts-core
+ 1.20.0
org.slf4j
slf4j-api
- 1.6.4
+ 2.0.13
org.slf4j
slf4j-simple
- 1.6.4
+ 2.0.13
org.apache.pdfbox
pdfbox
- 1.8.10
+ 3.0.4
org.bouncycastle
- bcprov-jdk15
- 1.44
+ bcprov-jdk18on
+ 1.80
+
org.bouncycastle
- bcmail-jdk15
- 1.44
+ bcmail-jdk18on
+ 1.80
junit
junit
- 4.11
+ 4.13.2
test
+
commons-cli
commons-cli
- 1.2
+ 1.8.0
+
org.apache.commons
commons-csv
- 1.0
+ 1.11.0
+
com.google.code.gson
gson
- 2.2.4
+ 2.11.0
+
+
+
+ com.github.jai-imageio
+ jai-imageio-core
+ 1.4.0
+
+
+
+ com.github.jai-imageio
+ jai-imageio-jpeg2000
+ 1.4.0
+
+
+
+ org.apache.pdfbox
+ jbig2-imageio
+ 3.0.4
diff --git a/src/main/java/technology/tabula/Cell.java b/src/main/java/technology/tabula/Cell.java
index b7e568db..d02c8c50 100644
--- a/src/main/java/technology/tabula/Cell.java
+++ b/src/main/java/technology/tabula/Cell.java
@@ -1,75 +1,62 @@
package technology.tabula;
import java.awt.geom.Point2D;
-import java.util.ArrayList;
import java.util.Collections;
-import java.util.List;
@SuppressWarnings("serial")
public class Cell extends RectangularTextContainer {
- private boolean spanning;
- private boolean placeholder;
- private List textElements;
-
- public Cell(float top, float left, float width, float height) {
- super(top, left, width, height);
- this.setPlaceholder(false);
- this.setSpanning(false);
- this.setTextElements(new ArrayList());
- }
-
- public Cell(Point2D topLeft, Point2D bottomRight) {
- super((float) topLeft.getY(), (float) topLeft.getX(), (float) (bottomRight.getX() - topLeft.getX()), (float) (bottomRight.getY() - topLeft.getY()));
- this.setPlaceholder(false);
- this.setSpanning(false);
- this.setTextElements(new ArrayList());
- }
-
- @Override
- public String getText(boolean useLineReturns) {
- if (this.textElements.size() == 0) {
- return "";
- }
- StringBuilder sb = new StringBuilder();
- Collections.sort(this.textElements);
- double curTop = this.textElements.get(0).getTop();
- for (TextChunk tc: this.textElements) {
- if (useLineReturns && tc.getTop() > curTop) {
- sb.append('\r');
- }
- sb.append(tc.getText());
- curTop = tc.getTop();
- }
- return sb.toString().trim();
- }
-
- public String getText() {
- return getText(true);
- }
-
- public boolean isSpanning() {
- return spanning;
- }
-
- public void setSpanning(boolean spanning) {
- this.spanning = spanning;
- }
-
- public boolean isPlaceholder() {
- return placeholder;
- }
-
- public void setPlaceholder(boolean placeholder) {
- this.placeholder = placeholder;
- }
-
-
- public List getTextElements() {
- return textElements;
- }
-
- public void setTextElements(List textElements) {
- this.textElements = textElements;
- }
+ /**
+ * Creates a cell from its top and left coordinates and its width and height,
+ * all of which are forwarded to the superclass constructor. The new cell is
+ * flagged as neither a placeholder nor spanning.
+ */
+ public Cell(float top, float left, float width, float height) {
+ super(top, left, width, height);
+ this.setPlaceholder(false);
+ this.setSpanning(false);
+ }
+
+ /**
+ * Creates a cell from two corner points. The top/left values are taken from
+ * {@code topLeft}; the width and height are the x and y differences between
+ * {@code bottomRight} and {@code topLeft}. The new cell is flagged as neither
+ * a placeholder nor spanning.
+ */
+ public Cell(Point2D topLeft, Point2D bottomRight) {
+ super((float) topLeft.getY(), (float) topLeft.getX(), (float) (bottomRight.getX() - topLeft.getX()), (float) (bottomRight.getY() - topLeft.getY()));
+ this.setPlaceholder(false);
+ this.setSpanning(false);
+ }
+
+ private boolean spanning;
+ private boolean placeholder;
+
+    /**
+     * Concatenates the text of this cell's text elements.
+     *
+     * The elements are first sorted in place with {@code Rectangle.ILL_DEFINED_ORDER}.
+     * When {@code useLineReturns} is true, a carriage return is inserted before any
+     * element whose top is greater than the top of the element before it.
+     *
+     * @param useLineReturns whether to insert '\r' between elements on different tops
+     * @return the trimmed concatenated text, or "" when there are no text elements
+     */
+    @Override
+    public String getText(boolean useLineReturns) {
+        if (this.textElements.isEmpty()) {
+            return "";
+        }
+
+        // Sorting mutates the element list held by this cell.
+        this.textElements.sort(Rectangle.ILL_DEFINED_ORDER);
+
+        StringBuilder text = new StringBuilder();
+        double previousTop = this.textElements.get(0).getTop();
+        for (TextChunk chunk : this.textElements) {
+            double top = chunk.getTop();
+            if (useLineReturns && top > previousTop) {
+                text.append('\r');
+            }
+            text.append(chunk.getText());
+            previousTop = top;
+        }
+        return text.toString().trim();
+    }
+
+ /**
+ * Returns the text of this cell with line returns enabled; equivalent to
+ * {@code getText(true)}.
+ */
+ @Override
+ public String getText() {
+ return getText(true);
+ }
+
+ /** Returns the current value of the spanning flag. */
+ public boolean isSpanning() {
+ return spanning;
+ }
+
+ /** Sets the spanning flag. */
+ public void setSpanning(boolean spanning) {
+ this.spanning = spanning;
+ }
+
+ /** Returns the current value of the placeholder flag. */
+ public boolean isPlaceholder() {
+ return placeholder;
+ }
+
+ /** Sets the placeholder flag. */
+ public void setPlaceholder(boolean placeholder) {
+ this.placeholder = placeholder;
+ }
}
diff --git a/src/main/java/technology/tabula/CohenSutherlandClipping.java b/src/main/java/technology/tabula/CohenSutherlandClipping.java
index b37dce9c..db9153e9 100644
--- a/src/main/java/technology/tabula/CohenSutherlandClipping.java
+++ b/src/main/java/technology/tabula/CohenSutherlandClipping.java
@@ -18,122 +18,124 @@
* Implements the well known Cohen Sutherland line
* clipping algorithm (line against clip rectangle).
*/
-public final class CohenSutherlandClipping
-{
+public final class CohenSutherlandClipping {
+
private double xMin;
private double yMin;
private double xMax;
private double yMax;
+ private static final int INSIDE = 0;
+ private static final int LEFT = 1;
+ private static final int RIGHT = 2;
+ private static final int BOTTOM = 4;
+ private static final int TOP = 8;
+
+ private final static float MINIMUM_DELTA = 0.01f;
+
/**
- * Creates a Cohen Sutherland clipper with clip rect (0, 0, 0, 0).
+ * Creates a Cohen Sutherland clipper with clip window (0, 0, 0, 0).
*/
- public CohenSutherlandClipping() {
- }
+ public CohenSutherlandClipping() {}
/**
- * Creates a Cohen Sutherland clipper with the given clip rectangle.
- * @param clip the clip rectangle to use
+ * Creates a Cohen Sutherland clipper with the given clip window.
+ * @param clipWindow the clip window to use.
*/
- public CohenSutherlandClipping(Rectangle2D clip) {
- setClip(clip);
+ public CohenSutherlandClipping(Rectangle2D clipWindow) {
+ setClip(clipWindow);
}
/**
* Sets the clip rectangle.
- * @param clip the clip rectangle
+ * @param clipWindow the clip window.
*/
- public void setClip(Rectangle2D clip) {
- xMin = clip.getX();
- xMax = xMin + clip.getWidth();
- yMin = clip.getY();
- yMax = yMin + clip.getHeight();
- }
-
- private static final int INSIDE = 0;
- private static final int LEFT = 1;
- private static final int RIGHT = 2;
- private static final int BOTTOM = 4;
- private static final int TOP = 8;
-
- private final int regionCode(double x, double y) {
- int code = x < xMin
- ? LEFT
- : x > xMax
- ? RIGHT
- : INSIDE;
- if (y < yMin) code |= BOTTOM;
- else if (y > yMax) code |= TOP;
- return code;
+ public void setClip(Rectangle2D clipWindow) {
+ xMin = clipWindow.getX();
+ xMax = xMin + clipWindow.getWidth();
+ yMin = clipWindow.getY();
+ yMax = yMin + clipWindow.getHeight();
}
+ // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - //
/**
- * Clips a given line against the clip rectangle.
+ * Clips a given line against the clip window.
* The modification (if needed) is done in place.
- * @param line the line to clip
+ * @param line the line to clip.
* @return true if line is clipped, false if line is
- * totally outside the clip rect.
+ * totally outside the clip window.
*/
public boolean clip(Line2D.Float line) {
+ // Each endpoint is wrapped in a Point, which computes its region code relative to the clip window.
+ Point point1 = new Point(line.getX1(), line.getY1());
+ Point point2 = new Point(line.getX2(), line.getY2());
+ // Scratch point: carries the region of the endpoint being clipped and the intersection computed for it.
+ Point outsidePoint = new Point(0d, 0d);
- double p1x = line.getX1();
- double p1y = line.getY1();
- double p2x = line.getX2();
- double p2y = line.getY2();
+ boolean lineIsVertical = (point1.x == point2.x);
+ // A vertical line gets a placeholder slope of 0; the BOTTOM/TOP branches use point1.x directly in that case.
+ double lineSlope = lineIsVertical ? 0d : (point2.y-point1.y)/(point2.x-point1.x);
- double qx = 0d;
- double qy = 0d;
+ // Loop until both endpoints lie inside the clip window.
+ while (point1.region != INSIDE || point2.region != INSIDE) {
+ // Both endpoints share an outside region bit, so the line is rejected.
+ if ((point1.region & point2.region) != 0) return false;
- boolean vertical = p1x == p2x;
+ // Work on point1 unless it is already inside, in which case work on point2.
+ outsidePoint.region = (point1.region == INSIDE) ? point2.region : point1.region;
- double slope = vertical
- ? 0d
- : (p2y-p1y)/(p2x-p1x);
-
- int c1 = regionCode(p1x, p1y);
- int c2 = regionCode(p2x, p2y);
-
- while (c1 != INSIDE || c2 != INSIDE) {
-
- if ((c1 & c2) != INSIDE)
- return false;
-
- int c = c1 == INSIDE ? c2 : c1;
-
- if ((c & LEFT) != INSIDE) {
- qx = xMin;
- qy = (qx-p1x)*slope + p1y;
+ // Intersect the line with the window edge named by the first matching region bit.
+ if ((outsidePoint.region & LEFT) != 0) {
+ outsidePoint.x = xMin;
+ outsidePoint.y = delta(outsidePoint.x, point1.x)*lineSlope + point1.y;
}
- else if ((c & RIGHT) != INSIDE) {
- qx = xMax;
- qy = (qx-p1x)*slope + p1y;
+ else if ((outsidePoint.region & RIGHT) != 0) {
+ outsidePoint.x = xMax;
+ outsidePoint.y = delta(outsidePoint.x, point1.x)*lineSlope + point1.y;
}
- else if ((c & BOTTOM) != INSIDE) {
- qy = yMin;
- qx = vertical
- ? p1x
- : (qy-p1y)/slope + p1x;
+ else if ((outsidePoint.region & BOTTOM) != 0) {
+ outsidePoint.y = yMin;
+ outsidePoint.x = lineIsVertical
+ ? point1.x
+ : delta(outsidePoint.y, point1.y)/lineSlope + point1.x;
}
- else if ((c & TOP) != INSIDE) {
- qy = yMax;
- qx = vertical
- ? p1x
- : (qy-p1y)/slope + p1x;
+ else if ((outsidePoint.region & TOP) != 0) {
+ outsidePoint.y = yMax;
+ outsidePoint.x = lineIsVertical
+ ? point1.x
+ : delta(outsidePoint.y, point1.y)/lineSlope + point1.x;
}
- if (c == c1) {
- p1x = qx;
- p1y = qy;
- c1 = regionCode(p1x, p1y);
+ // Move the endpoint whose region was selected onto the intersection; this also recomputes its region.
+ if (outsidePoint.isInTheSameRegionAs(point1)) {
+ point1.setPositionAndRegion(outsidePoint.x, outsidePoint.y);
}
else {
- p2x = qx;
- p2y = qy;
- c2 = regionCode(p2x, p2y);
+ point2.setPositionAndRegion(outsidePoint.x, outsidePoint.y);
}
}
- line.setLine(p1x, p1y, p2x, p2y);
+ line.setLine(point1.x, point1.y, point2.x, point2.y);
return true;
}
+
+ // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - //
+    /**
+     * Returns {@code value1 - value2}, or 0 when the absolute difference is
+     * smaller than {@code MINIMUM_DELTA}.
+     */
+    private static double delta(double value1, double value2) {
+        double difference = value1 - value2;
+        if (Math.abs(difference) < MINIMUM_DELTA) {
+            return 0;
+        }
+        return difference;
+    }
+
+ // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - //
+ class Point {
+ double x, y;
+ int region;
+
+ Point(double x, double y) {
+ setPositionAndRegion(x, y);
+ }
+
+ void setPositionAndRegion(double x, double y) {
+ this.x = x; this.y = y;
+ region = (x < xMin) ? LEFT : (x > xMax) ? RIGHT : INSIDE;
+ if (y < yMin)
+ region |= BOTTOM;
+ else if (y > yMax)
+ region |= TOP;
+ }
+
+ boolean isInTheSameRegionAs(Point otherPoint) {
+ return this.region == otherPoint.region;
+ }
+ }
+
}
-// end of file
\ No newline at end of file
diff --git a/src/main/java/technology/tabula/CommandLineApp.java b/src/main/java/technology/tabula/CommandLineApp.java
index ca755307..1b422303 100644
--- a/src/main/java/technology/tabula/CommandLineApp.java
+++ b/src/main/java/technology/tabula/CommandLineApp.java
@@ -2,20 +2,24 @@
import java.io.BufferedWriter;
import java.io.File;
+import java.io.FilenameFilter;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
-import java.util.Iterator;
import java.util.List;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.HelpFormatter;
-import org.apache.commons.cli.OptionBuilder;
+import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
-import org.apache.commons.cli.GnuParser;
+import org.apache.commons.cli.DefaultParser;
+import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
+
+import technology.tabula.detectors.DetectionAlgorithm;
+import technology.tabula.detectors.NurminenDetectionAlgorithm;
import technology.tabula.extractors.BasicExtractionAlgorithm;
import technology.tabula.extractors.SpreadsheetExtractionAlgorithm;
import technology.tabula.writers.CSVWriter;
@@ -26,239 +30,472 @@
public class CommandLineApp {
- private static String VERSION = "0.8.0";
- private static String VERSION_STRING = String.format("tabula %s (c) 2012-2014 Manuel Aristarán", VERSION);
+ private static String VERSION = "1.0.6-SNAPSHOT";
+ private static String VERSION_STRING = String.format("tabula %s (c) 2012-2020 Manuel Aristarán", VERSION);
private static String BANNER = "\nTabula helps you extract tables from PDFs\n\n";
+ private static final int RELATIVE_AREA_CALCULATION_MODE = 0;
+ private static final int ABSOLUTE_AREA_CALCULATION_MODE = 1;
+
+
+ private Appendable defaultOutput;
+
+ private List> pageAreas;
+ private List pages;
+ private OutputFormat outputFormat;
+ private String password;
+ private TableExtractor tableExtractor;
+
+    /**
+     * Builds an application instance from a parsed command line.
+     *
+     * @param defaultOutput destination used when no output file is requested
+     * @param line the parsed command line the areas, pages, output format,
+     *             extractor and password are read from
+     * @throws ParseException if one of the option parsers rejects its value
+     */
+    public CommandLineApp(Appendable defaultOutput, CommandLine line) throws ParseException {
+        this.defaultOutput = defaultOutput;
+        this.pageAreas = whichAreas(line);
+        this.pages = whichPages(line);
+        this.outputFormat = whichOutputFormat(line);
+        this.tableExtractor = createExtractor(line);
+        // The password stays null unless -s was given.
+        this.password = line.hasOption('s') ? line.getOptionValue('s') : null;
+    }
+
public static void main(String[] args) {
- CommandLineParser parser = new GnuParser();
+ CommandLineParser parser = new DefaultParser();
try {
// parse the command line arguments
- CommandLine line = parser.parse(buildOptions(), args );
-
+ CommandLine line = parser.parse(buildOptions(), args);
+
if (line.hasOption('h')) {
printHelp();
System.exit(0);
}
-
+
if (line.hasOption('v')) {
System.out.println(VERSION_STRING);
System.exit(0);
}
-
- if (line.getArgs().length != 1) {
- throw new ParseException("Need one filename\nTry --help for help");
- }
-
- extractTables(line);
-
- }
- catch( ParseException exp ) {
+
+ // Extraction output defaults to stdout; argument validation happens inside extractTables.
+ new CommandLineApp(System.out, line).extractTables(line);
+ } catch (ParseException exp) {
+ // Any ParseException is reported on stderr and ends the process with exit code 1.
System.err.println("Error: " + exp.getMessage());
System.exit(1);
}
System.exit(0);
}
-
- static void extractTables(CommandLine line) throws ParseException {
+
+ /**
+ * Entry point for extraction. With the 'b' option, processes the given directory
+ * (no positional filename is allowed); otherwise requires exactly one positional
+ * filename naming an existing file.
+ *
+ * @throws ParseException if the arguments are inconsistent or the file/directory check fails
+ */
+ public void extractTables(CommandLine line) throws ParseException {
+ if (line.hasOption('b')) {
+ if (line.getArgs().length != 0) {
+ throw new ParseException("Filename specified with batch\nTry --help for help");
+ }
+
+ File pdfDirectory = new File(line.getOptionValue('b'));
+ if (!pdfDirectory.isDirectory()) {
+ throw new ParseException("Directory does not exist or is not a directory");
+ }
+ extractDirectoryTables(line, pdfDirectory);
+ return;
+ }
+
+ if (line.getArgs().length != 1) {
+ throw new ParseException("Need exactly one filename\nTry --help for help");
+ }
+
File pdfFile = new File(line.getArgs()[0]);
if (!pdfFile.exists()) {
throw new ParseException("File does not exist");
}
-
- OutputFormat of = OutputFormat.CSV;
- if (line.hasOption('f')) {
- try {
- of = OutputFormat.valueOf(line.getOptionValue('f'));
- }
- catch (IllegalArgumentException e) {
- throw new ParseException(String.format(
- "format %s is illegal. Available formats: %s",
- line.getOptionValue('f'),
- Utils.join(",", OutputFormat.formatNames())));
+ extractFileTables(line, pdfFile);
+ }
+
+    /**
+     * Extracts tables from every file in {@code pdfDirectory} whose name ends
+     * with ".pdf", writing each result to the file named by
+     * {@code getOutputFilename(pdfFile)}.
+     *
+     * @param line the parsed command line (not read by this method)
+     * @param pdfDirectory the directory to scan
+     * @throws ParseException if the directory cannot be listed, or if extracting
+     *                        one of the files fails (remaining files are not processed)
+     */
+    public void extractDirectoryTables(CommandLine line, File pdfDirectory) throws ParseException {
+        File[] pdfs = pdfDirectory.listFiles(new FilenameFilter() {
+            public boolean accept(File dir, String name) {
+                return name.endsWith(".pdf");
}
-
+        });
+        // File.listFiles returns null (not an empty array) when the path is not a
+        // directory or an I/O error occurs; fail cleanly instead of with an NPE.
+        if (pdfs == null) {
+            throw new ParseException("Cannot list files in directory " + pdfDirectory);
+        }
+
+        for (File pdfFile : pdfs) {
+            File outputFile = new File(getOutputFilename(pdfFile));
+            try {
+                extractFileInto(pdfFile, outputFile);
+            } catch (ParseException e) {
+                System.err.println("Caught exception while processing file: " + pdfFile.toString());
+                throw e;
+            }
}
-
- Appendable outFile = System.out;
- if (line.hasOption('o')) {
- File file = new File(line.getOptionValue('o'));
-
- try {
- file.createNewFile();
- outFile = new BufferedWriter(new FileWriter(
- file.getAbsoluteFile()));
- } catch (IOException e) {
- throw new ParseException("Cannot create file "
- + line.getOptionValue('o'));
- }
+    }
+
+ /**
+ * Extracts the tables of a single PDF file. Without the 'o' option the result goes
+ * to the default output; with it, the result is written to the named file.
+ */
+ public void extractFileTables(CommandLine line, File pdfFile) throws ParseException {
+ if (!line.hasOption('o')) {
+ extractFile(pdfFile, this.defaultOutput);
+ return;
}
-
- Rectangle area = null;
- if (line.hasOption('a')) {
- List f = parseFloatList(line.getOptionValue('a'));
- if (f.size() != 4) {
- throw new ParseException("area parameters must be top,left,bottom,right");
+
+ File outputFile = new File(line.getOptionValue('o'));
+ extractFileInto(pdfFile, outputFile);
+ }
+
+ public void extractFileInto(File pdfFile, File outputFile) throws ParseException {
+ BufferedWriter bufferedWriter = null;
+ try {
+ FileWriter fileWriter = new FileWriter(outputFile.getAbsoluteFile());
+ bufferedWriter = new BufferedWriter(fileWriter);
+
+ outputFile.createNewFile();
+ extractFile(pdfFile, bufferedWriter);
+ } catch (IOException e) {
+ throw new ParseException("Cannot create file " + outputFile);
+ } finally {
+ if (bufferedWriter != null) {
+ try {
+ bufferedWriter.close();
+ } catch (IOException e) {
+ System.out.println("Error in closing the BufferedWriter" + e);
+ }
}
- area = new Rectangle(f.get(0), f.get(1), f.get(3) - f.get(1), f.get(2) - f.get(0));
}
-
- List verticalRulingPositions = null;
- if (line.hasOption('c')) {
- verticalRulingPositions = parseFloatList(line.getOptionValue('c'));
- }
-
- String pagesOption = line.hasOption('p') ? line.getOptionValue('p') : "1";
- List pages = Utils.parsePagesOption(pagesOption);
- ExtractionMethod method = whichExtractionMethod(line);
- boolean useLineReturns = line.hasOption('u');
-
+ }
+
+    /**
+     * Loads {@code pdfFile} (with the configured password, if any), extracts the
+     * tables of the selected pages and writes them all to {@code outFile}.
+     *
+     * Configured vertical ruling positions are added to each page as rulings.
+     * When page areas are configured, each area is extracted separately; areas in
+     * relative mode are given in percent and scaled by the page width and height.
+     *
+     * @throws ParseException wrapping the message of any IOException raised
+     */
+    private void extractFile(File pdfFile, Appendable outFile) throws ParseException {
+        PDDocument pdfDocument = null;
try {
-
- ObjectExtractor oe = line.hasOption('s') ?
- new ObjectExtractor(PDDocument.load(pdfFile), line.getOptionValue('s')) :
- new ObjectExtractor(PDDocument.load(pdfFile));
- BasicExtractionAlgorithm basicExtractor = new BasicExtractionAlgorithm();
- SpreadsheetExtractionAlgorithm spreadsheetExtractor = new SpreadsheetExtractionAlgorithm();
-
- PageIterator pageIterator = pages == null ? oe.extract() : oe.extract(pages);
- Page page;
- List tables = new ArrayList();
+            pdfDocument = this.password == null ? Loader.loadPDF(pdfFile) : Loader.loadPDF(pdfFile, password);
+            PageIterator pageIterator = getPageIterator(pdfDocument);
+            List<Table> tables = new ArrayList<>();
while (pageIterator.hasNext()) {
- page = pageIterator.next();
-
- if (area != null) {
- page = page.getArea(area);
- }
+                Page page = pageIterator.next();
- if (method == ExtractionMethod.DECIDE) {
- method = spreadsheetExtractor.isTabular(page) ? ExtractionMethod.SPREADSHEET : ExtractionMethod.BASIC;
+                if (tableExtractor.verticalRulingPositions != null) {
+                    for (Float verticalRulingPosition : tableExtractor.verticalRulingPositions) {
+                        page.addRuling(new Ruling(0, verticalRulingPosition, 0.0f, (float) page.getHeight()));
+                    }
}
-
- switch(method) {
- case BASIC:
- if (line.hasOption('g')) {
-
+
+                if (pageAreas != null) {
+                    for (Pair<Integer, Rectangle> areaPair : pageAreas) {
+                        Rectangle area = areaPair.getRight();
+                        if (areaPair.getLeft() == RELATIVE_AREA_CALCULATION_MODE) {
+                            // Relative areas are percentages: scale them by the page dimensions.
+                            area = new Rectangle((float) (area.getTop() / 100 * page.getHeight()),
+                                    (float) (area.getLeft() / 100 * page.getWidth()), (float) (area.getWidth() / 100 * page.getWidth()),
+                                    (float) (area.getHeight() / 100 * page.getHeight()));
+                        }
+                        tables.addAll(tableExtractor.extractTables(page.getArea(area)));
}
- tables.addAll(verticalRulingPositions == null ? basicExtractor.extract(page) : basicExtractor.extract(page, verticalRulingPositions));
-
- break;
- case SPREADSHEET:
- // TODO add useLineReturns
- tables.addAll(spreadsheetExtractor.extract(page));
- default:
- break;
+                } else {
+                    tables.addAll(tableExtractor.extractTables(page));
}
- writeTables(of, tables, outFile);
- tables.clear();
}
-
+            writeTables(tables, outFile);
} catch (IOException e) {
throw new ParseException(e.getMessage());
+        } finally {
+            try {
+                if (pdfDocument != null) {
+                    pdfDocument.close();
+                }
+            } catch (IOException e) {
+                // Report on stderr: stdout is the default destination for the extracted tables.
+                System.err.println("Error in closing pdf document: " + e);
+            }
}
+    }
+ private PageIterator getPageIterator(PDDocument pdfDocument) throws IOException {
+ ObjectExtractor extractor = new ObjectExtractor(pdfDocument);
+ return (pages == null) ?
+ extractor.extract() :
+ extractor.extract(pages);
}
-
- static void writeTables(OutputFormat format, List tables, Appendable out) throws IOException {
- Writer writer = null;
- switch (format) {
- case CSV:
- writer = new CSVWriter();
- break;
- case JSON:
- writer = new JSONWriter();
- break;
- case TSV:
- writer = new TSVWriter();
- break;
+
+ // CommandLine parsing methods
+
+ private static OutputFormat whichOutputFormat(CommandLine line) throws ParseException {
+ if (!line.hasOption('f')) {
+ return OutputFormat.CSV;
}
- writer.write(out, tables);
+
+ try {
+ return OutputFormat.valueOf(line.getOptionValue('f'));
+ } catch (IllegalArgumentException e) {
+ throw new ParseException(String.format(
+ "format %s is illegal. Available formats: %s",
+ line.getOptionValue('f'),
+ Utils.join(",", OutputFormat.formatNames())));
+ }
+ }
+
+ private static List<Pair<Integer, Rectangle>> whichAreas(CommandLine line) throws ParseException {
+ if (!line.hasOption('a')) {
+ return null;
+ }
+
+ String[] optionValues = line.getOptionValues('a');
+
+ List<Pair<Integer, Rectangle>> areaList = new ArrayList<Pair<Integer, Rectangle>>();
+ for (String optionValue : optionValues) {
+ int areaCalculationMode = ABSOLUTE_AREA_CALCULATION_MODE;
+ int startIndex = 0;
+ if (optionValue.startsWith("%")) {
+ startIndex = 1;
+ areaCalculationMode = RELATIVE_AREA_CALCULATION_MODE;
+ }
+ List f = parseFloatList(optionValue.substring(startIndex));
+ if (f.size() != 4) {
+ throw new ParseException("area parameters must be top,left,bottom,right optionally preceded by %");
+ }
+ areaList.add(new Pair(areaCalculationMode, new Rectangle(f.get(0), f.get(1), f.get(3) - f.get(1), f.get(2) - f.get(0))));
+ }
+ return areaList;
}
-
- static ExtractionMethod whichExtractionMethod(CommandLine line) {
- ExtractionMethod rv = ExtractionMethod.DECIDE;
- if (line.hasOption('r')) {
- rv = ExtractionMethod.SPREADSHEET;
+
+ private static List whichPages(CommandLine line) throws ParseException {
+ String pagesOption = line.hasOption('p') ? line.getOptionValue('p') : "1";
+ return Utils.parsePagesOption(pagesOption);
+ }
+
+ private static ExtractionMethod whichExtractionMethod(CommandLine line) {
+ // -r/--spreadsheet [deprecated; use -l] or -l/--lattice
+ if (line.hasOption('r') || line.hasOption('l')) {
+ return ExtractionMethod.SPREADSHEET;
+ }
+
+ // -n/--no-spreadsheet [deprecated; use -t] or -c/--columns or -g/--guess or -t/--stream
+ if (line.hasOption('n') || line.hasOption('c') || line.hasOption('t')) {
+ return ExtractionMethod.BASIC;
}
- else if (line.hasOption('n') || line.hasOption('c') || line.hasOption('a') || line.hasOption('g')) {
- rv = ExtractionMethod.BASIC;
+ return ExtractionMethod.DECIDE;
+ }
+
+ private static TableExtractor createExtractor(CommandLine line) throws ParseException {
+ TableExtractor extractor = new TableExtractor();
+ extractor.setGuess(line.hasOption('g'));
+ extractor.setMethod(CommandLineApp.whichExtractionMethod(line));
+ extractor.setUseLineReturns(line.hasOption('u'));
+
+ if (line.hasOption('c')) {
+ String optionString = line.getOptionValue('c');
+ if (optionString.startsWith("%")) {
+ extractor.setVerticalRulingPositionsRelative(true);
+ optionString = optionString.substring(1);
+ }
+ extractor.setVerticalRulingPositions(parseFloatList(optionString));
}
- return rv;
+
+ return extractor;
}
-
-
-
+
+ // utilities, etc.
+
public static List parseFloatList(String option) throws ParseException {
String[] f = option.split(",");
- List rv = new ArrayList();
+ List rv = new ArrayList<>();
try {
- for (int i = 0; i < f.length; i++) {
- rv.add(Float.parseFloat(f[i]));
+ for (final String element : f) {
+ rv.add(Float.parseFloat(element));
}
return rv;
} catch (NumberFormatException e) {
throw new ParseException("Wrong number syntax");
}
}
-
+
private static void printHelp() {
HelpFormatter formatter = new HelpFormatter();
formatter.printHelp("tabula", BANNER, buildOptions(), "", true);
}
-
- @SuppressWarnings("static-access")
- static Options buildOptions() {
+
+ public static Options buildOptions() {
Options o = new Options();
-
+
o.addOption("v", "version", false, "Print version and exit.");
o.addOption("h", "help", false, "Print this help text.");
o.addOption("g", "guess", false, "Guess the portion of the page to analyze per page.");
- o.addOption("d", "debug", false, "Print detected table areas instead of processing");
- o.addOption("r", "spreadsheet", false, "Force PDF to be extracted using spreadsheet-style extraction (if there are ruling lines separating each cell, as in a PDF of an Excel spreadsheet)");
- o.addOption("n", "no-spreadsheet", false, "Force PDF not to be extracted using spreadsheet-style extraction (if there are ruling lines separating each cell, as in a PDF of an Excel spreadsheet)");
+ o.addOption("r", "spreadsheet", false, "[Deprecated in favor of -l/--lattice] Force PDF to be extracted using spreadsheet-style extraction (if there are ruling lines separating each cell, as in a PDF of an Excel spreadsheet)");
+ o.addOption("n", "no-spreadsheet", false, "[Deprecated in favor of -t/--stream] Force PDF not to be extracted using spreadsheet-style extraction (if there are no ruling lines separating each cell)");
+ o.addOption("l", "lattice", false, "Force PDF to be extracted using lattice-mode extraction (if there are ruling lines separating each cell, as in a PDF of an Excel spreadsheet)");
+ o.addOption("t", "stream", false, "Force PDF to be extracted using stream-mode extraction (if there are no ruling lines separating each cell)");
o.addOption("i", "silent", false, "Suppress all stderr output.");
o.addOption("u", "use-line-returns", false, "Use embedded line returns in cells. (Only in spreadsheet mode.)");
- o.addOption("d", "debug", false, "Print detected table areas instead of processing.");
- o.addOption(OptionBuilder.withLongOpt("outfile")
- .withDescription("Write output to <file> instead of STDOUT. Default: -")
- .hasArg()
- .withArgName("OUTFILE")
- .create("o"));
- o.addOption(OptionBuilder.withLongOpt("format")
- .withDescription("Output format: (" + Utils.join(",", OutputFormat.formatNames()) + "). Default: CSV")
- .hasArg()
- .withArgName("FORMAT")
- .create("f"));
- o.addOption(OptionBuilder.withLongOpt("password")
- .withDescription("Password to decrypt document. Default is empty")
- .hasArg()
- .withArgName("PASSWORD")
- .create("s"));
- o.addOption(OptionBuilder.withLongOpt("columns")
- .withDescription("X coordinates of column boundaries. Example --columns 10.1,20.2,30.3")
- .hasArg()
- .withArgName("COLUMNS")
- .create("c"));
- o.addOption(OptionBuilder.withLongOpt("area")
- .withDescription("Portion of the page to analyze (top,left,bottom,right). Example: --area 269.875,12.75,790.5,561. Default is entire page")
- .hasArg()
- .withArgName("AREA")
- .create("a"));
- o.addOption(OptionBuilder.withLongOpt("pages")
- .withDescription("Comma separated list of ranges, or all. Examples: --pages 1-3,5-7, --pages 3 or --pages all. Default is --pages 1")
- .hasArg()
- .withArgName("PAGES")
- .create("p"));
-
+ // o.addOption("d", "debug", false, "Print detected table areas instead of processing.");
+ o.addOption(Option.builder("b")
+ .longOpt("batch")
+ .desc("Convert all .pdfs in the provided directory.")
+ .hasArg()
+ .argName("DIRECTORY")
+ .build());
+ o.addOption(Option.builder("o")
+ .longOpt("outfile")
+ .desc("Write output to <file> instead of STDOUT. Default: -")
+ .hasArg()
+ .argName("OUTFILE")
+ .build());
+ o.addOption(Option.builder("f")
+ .longOpt("format")
+ .desc("Output format: (" + Utils.join(",", OutputFormat.formatNames()) + "). Default: CSV")
+ .hasArg()
+ .argName("FORMAT")
+ .build());
+ o.addOption(Option.builder("s")
+ .longOpt("password")
+ .desc("Password to decrypt document. Default is empty")
+ .hasArg()
+ .argName("PASSWORD")
+ .build());
+ o.addOption(Option.builder("c")
+ .longOpt("columns")
+ .desc("X coordinates of column boundaries. Example --columns 10.1,20.2,30.3. "
+ + "If all values are between 0-100 (inclusive) and preceded by '%', input will be taken as % of actual width of the page. "
+ + "Example: --columns %25,50,80.6")
+ .hasArg()
+ .argName("COLUMNS")
+ .build());
+ o.addOption(Option.builder("a")
+ .longOpt("area")
+ .desc("-a/--area = Portion of the page to analyze. Example: --area 269.875,12.75,790.5,561. "
+ + "Accepts top,left,bottom,right i.e. y1,x1,y2,x2 where all values are in points relative to the top left corner. "
+ + "If all values are between 0-100 (inclusive) and preceded by '%', input will be taken as % of actual height or width of the page. "
+ + "Example: --area %0,0,100,50. To specify multiple areas, -a option should be repeated. Default is entire page")
+ .hasArg()
+ .argName("AREA")
+ .build());
+ o.addOption(Option.builder("p")
+ .longOpt("pages")
+ .desc("Comma separated list of ranges, or all. Examples: --pages 1-3,5-7, --pages 3 or --pages all. Default is --pages 1")
+ .hasArg()
+ .argName("PAGES")
+ .build());
+
return o;
}
-
+
+ private static class TableExtractor {
+ private boolean guess = false;
+ private boolean useLineReturns = false;
+ private BasicExtractionAlgorithm basicExtractor = new BasicExtractionAlgorithm();
+ private SpreadsheetExtractionAlgorithm spreadsheetExtractor = new SpreadsheetExtractionAlgorithm();
+
+ private boolean verticalRulingPositionsRelative = false;
+ private List verticalRulingPositions = null;
+
+ private ExtractionMethod method = ExtractionMethod.BASIC;
+
+ public TableExtractor() {
+ }
+
+ public void setVerticalRulingPositions(List positions) {
+ this.verticalRulingPositions = positions;
+ }
+
+ public void setVerticalRulingPositionsRelative(boolean relative) {
+ this.verticalRulingPositionsRelative = relative;
+ }
+
+ public void setGuess(boolean guess) {
+ this.guess = guess;
+ }
+
+ public void setUseLineReturns(boolean useLineReturns) {
+ this.useLineReturns = useLineReturns;
+ }
+
+ public void setMethod(ExtractionMethod method) {
+ this.method = method;
+ }
+
+ public List extractTables(Page page) {
+ ExtractionMethod effectiveMethod = this.method;
+ if (effectiveMethod == ExtractionMethod.DECIDE) {
+ effectiveMethod = spreadsheetExtractor.isTabular(page) ?
+ ExtractionMethod.SPREADSHEET :
+ ExtractionMethod.BASIC;
+ }
+ switch (effectiveMethod) {
+ case BASIC:
+ return extractTablesBasic(page);
+ case SPREADSHEET:
+ return extractTablesSpreadsheet(page);
+ default:
+ return new ArrayList<>();
+ }
+ }
+
+ public List extractTablesBasic(Page page) {
+ if (guess) {
+ // guess the page areas to extract using a detection algorithm
+ // currently we only have a detector that uses spreadsheets to find table areas
+ DetectionAlgorithm detector = new NurminenDetectionAlgorithm();
+ List guesses = detector.detect(page);
+ List tables = new ArrayList<>();
+
+ for (Rectangle guessRect : guesses) {
+ Page guess = page.getArea(guessRect);
+ tables.addAll(basicExtractor.extract(guess));
+ }
+ return tables;
+ }
+
+ if (verticalRulingPositions != null) {
+ List absoluteRulingPositions;
+
+ if (this.verticalRulingPositionsRelative) {
+ // convert relative to absolute
+ absoluteRulingPositions = new ArrayList<>(verticalRulingPositions.size());
+ for (float relative : this.verticalRulingPositions) {
+ float absolute = (float) (relative / 100.0 * page.getWidth());
+ absoluteRulingPositions.add(absolute);
+ }
+ } else {
+ absoluteRulingPositions = this.verticalRulingPositions;
+ }
+ return basicExtractor.extract(page, absoluteRulingPositions);
+ }
+
+ return basicExtractor.extract(page);
+ }
+
+ public List extractTablesSpreadsheet(Page page) {
+ // TODO add useLineReturns
+ return spreadsheetExtractor.extract(page);
+ }
+ }
+
+ private void writeTables(List tables, Appendable out) throws IOException {
+ Writer writer = null;
+ switch (outputFormat) {
+ case CSV:
+ writer = new CSVWriter();
+ break;
+ case JSON:
+ writer = new JSONWriter();
+ break;
+ case TSV:
+ writer = new TSVWriter();
+ break;
+ }
+ writer.write(out, tables);
+ }
+
+ private String getOutputFilename(File pdfFile) {
+ String extension = ".csv";
+ switch (outputFormat) {
+ case CSV:
+ extension = ".csv";
+ break;
+ case JSON:
+ extension = ".json";
+ break;
+ case TSV:
+ extension = ".tsv";
+ break;
+ }
+ return pdfFile.getPath().replaceFirst("(\\.pdf|)$", extension);
+ }
+
private enum OutputFormat {
CSV,
TSV,
JSON;
-
+
static String[] formatNames() {
OutputFormat[] values = OutputFormat.values();
String[] rv = new String[values.length];
@@ -267,25 +504,24 @@ static String[] formatNames() {
}
return rv;
}
-
}
-
+
private enum ExtractionMethod {
BASIC,
SPREADSHEET,
DECIDE
}
-
+
private class DebugOutput {
private boolean debugEnabled;
public DebugOutput(boolean debug) {
this.debugEnabled = debug;
}
-
+
public void debug(String msg) {
if (this.debugEnabled) {
- System.err.println(msg);
+ System.err.println(msg);
}
}
}
diff --git a/src/main/java/technology/tabula/DummyGraphics2D.java b/src/main/java/technology/tabula/DummyGraphics2D.java
deleted file mode 100644
index 88026fec..00000000
--- a/src/main/java/technology/tabula/DummyGraphics2D.java
+++ /dev/null
@@ -1,461 +0,0 @@
-package technology.tabula;
-
-import java.awt.Color;
-import java.awt.Composite;
-import java.awt.Font;
-import java.awt.FontMetrics;
-import java.awt.Graphics;
-import java.awt.Graphics2D;
-import java.awt.GraphicsConfiguration;
-import java.awt.Image;
-import java.awt.Paint;
-import java.awt.Rectangle;
-import java.awt.RenderingHints;
-import java.awt.RenderingHints.Key;
-import java.awt.Shape;
-import java.awt.Stroke;
-import java.awt.font.FontRenderContext;
-import java.awt.font.GlyphVector;
-import java.awt.geom.AffineTransform;
-import java.awt.image.BufferedImage;
-import java.awt.image.BufferedImageOp;
-import java.awt.image.ImageObserver;
-import java.awt.image.RenderedImage;
-import java.awt.image.renderable.RenderableImage;
-import java.text.AttributedCharacterIterator;
-import java.util.Map;
-
-public class DummyGraphics2D extends Graphics2D {
-
- @Override
- public void addRenderingHints(Map<?, ?> hints) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void clip(Shape s) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void draw(Shape s) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void drawGlyphVector(GlyphVector g, float x, float y) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public boolean drawImage(Image img, AffineTransform xform, ImageObserver obs) {
- // TODO Auto-generated method stub
- return false;
- }
-
- @Override
- public void drawImage(BufferedImage img, BufferedImageOp op, int x, int y) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void drawRenderableImage(RenderableImage img, AffineTransform xform) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void drawRenderedImage(RenderedImage img, AffineTransform xform) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void drawString(String str, int x, int y) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void drawString(String str, float x, float y) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void drawString(AttributedCharacterIterator iterator, int x, int y) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void drawString(AttributedCharacterIterator iterator, float x,
- float y) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void fill(Shape s) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public Color getBackground() {
- // TODO Auto-generated method stub
- return null;
- }
-
- @Override
- public Composite getComposite() {
- // TODO Auto-generated method stub
- return null;
- }
-
- @Override
- public GraphicsConfiguration getDeviceConfiguration() {
- // TODO Auto-generated method stub
- return null;
- }
-
- @Override
- public FontRenderContext getFontRenderContext() {
- // TODO Auto-generated method stub
- return null;
- }
-
- @Override
- public Paint getPaint() {
- // TODO Auto-generated method stub
- return null;
- }
-
- @Override
- public Object getRenderingHint(Key hintKey) {
- // TODO Auto-generated method stub
- return null;
- }
-
- @Override
- public RenderingHints getRenderingHints() {
- // TODO Auto-generated method stub
- return null;
- }
-
- @Override
- public Stroke getStroke() {
- // TODO Auto-generated method stub
- return null;
- }
-
- @Override
- public AffineTransform getTransform() {
- // TODO Auto-generated method stub
- return null;
- }
-
- @Override
- public boolean hit(Rectangle rect, Shape s, boolean onStroke) {
- // TODO Auto-generated method stub
- return false;
- }
-
- @Override
- public void rotate(double theta) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void rotate(double theta, double x, double y) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void scale(double sx, double sy) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void setBackground(Color color) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void setComposite(Composite comp) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void setPaint(Paint paint) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void setRenderingHint(Key hintKey, Object hintValue) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void setRenderingHints(Map<?, ?> hints) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void setStroke(Stroke s) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void setTransform(AffineTransform Tx) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void shear(double shx, double shy) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void transform(AffineTransform Tx) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void translate(int x, int y) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void translate(double tx, double ty) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void clearRect(int x, int y, int width, int height) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void clipRect(int x, int y, int width, int height) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void copyArea(int x, int y, int width, int height, int dx, int dy) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public Graphics create() {
- // TODO Auto-generated method stub
- return null;
- }
-
- @Override
- public void dispose() {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void drawArc(int x, int y, int width, int height, int startAngle,
- int arcAngle) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public boolean drawImage(Image img, int x, int y, ImageObserver observer) {
- // TODO Auto-generated method stub
- return false;
- }
-
- @Override
- public boolean drawImage(Image img, int x, int y, Color bgcolor,
- ImageObserver observer) {
- // TODO Auto-generated method stub
- return false;
- }
-
- @Override
- public boolean drawImage(Image img, int x, int y, int width, int height,
- ImageObserver observer) {
- // TODO Auto-generated method stub
- return false;
- }
-
- @Override
- public boolean drawImage(Image img, int x, int y, int width, int height,
- Color bgcolor, ImageObserver observer) {
- // TODO Auto-generated method stub
- return false;
- }
-
- @Override
- public boolean drawImage(Image img, int dx1, int dy1, int dx2, int dy2,
- int sx1, int sy1, int sx2, int sy2, ImageObserver observer) {
- // TODO Auto-generated method stub
- return false;
- }
-
- @Override
- public boolean drawImage(Image img, int dx1, int dy1, int dx2, int dy2,
- int sx1, int sy1, int sx2, int sy2, Color bgcolor,
- ImageObserver observer) {
- // TODO Auto-generated method stub
- return false;
- }
-
- @Override
- public void drawLine(int x1, int y1, int x2, int y2) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void drawOval(int x, int y, int width, int height) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void drawPolygon(int[] xPoints, int[] yPoints, int nPoints) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void drawPolyline(int[] xPoints, int[] yPoints, int nPoints) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void drawRoundRect(int x, int y, int width, int height,
- int arcWidth, int arcHeight) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void fillArc(int x, int y, int width, int height, int startAngle,
- int arcAngle) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void fillOval(int x, int y, int width, int height) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void fillPolygon(int[] xPoints, int[] yPoints, int nPoints) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void fillRect(int x, int y, int width, int height) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void fillRoundRect(int x, int y, int width, int height,
- int arcWidth, int arcHeight) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public Shape getClip() {
- // TODO Auto-generated method stub
- return null;
- }
-
- @Override
- public Rectangle getClipBounds() {
- // TODO Auto-generated method stub
- return null;
- }
-
- @Override
- public Color getColor() {
- // TODO Auto-generated method stub
- return null;
- }
-
- @Override
- public Font getFont() {
- // TODO Auto-generated method stub
- return null;
- }
-
- @Override
- public FontMetrics getFontMetrics(Font f) {
- // TODO Auto-generated method stub
- return null;
- }
-
- @Override
- public void setClip(Shape clip) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void setClip(int x, int y, int width, int height) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void setColor(Color c) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void setFont(Font font) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void setPaintMode() {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void setXORMode(Color c1) {
- // TODO Auto-generated method stub
-
- }
-
-}
diff --git a/src/main/java/technology/tabula/HasText.java b/src/main/java/technology/tabula/HasText.java
index 6f375dbc..1a9bda99 100644
--- a/src/main/java/technology/tabula/HasText.java
+++ b/src/main/java/technology/tabula/HasText.java
@@ -1,7 +1,8 @@
package technology.tabula;
public interface HasText {
-
- String getText();
+
+ String getText();
+ String getText(boolean useLineReturns);
}
diff --git a/src/main/java/technology/tabula/Line.java b/src/main/java/technology/tabula/Line.java
index ed2f6895..31d10529 100644
--- a/src/main/java/technology/tabula/Line.java
+++ b/src/main/java/technology/tabula/Line.java
@@ -8,7 +8,7 @@
@SuppressWarnings("serial")
public class Line extends Rectangle {
- List textChunks = new ArrayList();
+ List textChunks = new ArrayList<>();
public static final Character[] WHITE_SPACE_CHARS = { ' ', '\t', '\r', '\n', '\f' };
@@ -52,7 +52,7 @@ public void addTextChunk(TextChunk textChunk) {
public String toString() {
StringBuilder sb = new StringBuilder();
String s = super.toString();
- sb.append(s.substring(0, s.length() - 1));
+ sb.append(s, 0, s.length() - 1);
sb.append(",chunks=");
for (TextChunk te: this.textChunks) {
sb.append("'" + te.getText() + "', ");
diff --git a/src/main/java/technology/tabula/ObjectExtractor.java b/src/main/java/technology/tabula/ObjectExtractor.java
index b38d393d..9f3f6a03 100644
--- a/src/main/java/technology/tabula/ObjectExtractor.java
+++ b/src/main/java/technology/tabula/ObjectExtractor.java
@@ -1,445 +1,73 @@
package technology.tabula;
-import java.awt.Image;
-import java.awt.Shape;
-import java.awt.event.KeyEvent;
-import java.awt.geom.AffineTransform;
-import java.awt.geom.GeneralPath;
-import java.awt.geom.Line2D;
-import java.awt.geom.PathIterator;
-import java.awt.geom.Point2D;
-import java.awt.geom.Rectangle2D;
import java.io.IOException;
-import java.lang.reflect.Field;
-import java.util.ArrayList;
-import java.util.Comparator;
-import java.util.List;
-import org.apache.pdfbox.exceptions.CryptographyException;
-import org.apache.pdfbox.pdfviewer.PageDrawer;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
-import org.apache.pdfbox.pdmodel.common.PDRectangle;
-import org.apache.pdfbox.pdmodel.common.PDStream;
-import org.apache.pdfbox.pdmodel.encryption.BadSecurityHandlerException;
-import org.apache.pdfbox.pdmodel.encryption.StandardDecryptionMaterial;
-import org.apache.pdfbox.pdmodel.font.PDFont;
-import org.apache.pdfbox.pdmodel.font.PDType3Font;
-import org.apache.pdfbox.pdmodel.graphics.PDGraphicsState;
-import org.apache.pdfbox.pdmodel.text.PDTextState;
-import org.apache.pdfbox.util.TextPosition;
-public class ObjectExtractor extends org.apache.pdfbox.pdfviewer.PageDrawer {
+public class ObjectExtractor implements java.io.Closeable {
- class PointComparator implements Comparator<Point2D> {
- @Override
- public int compare(Point2D o1, Point2D o2) {
- float o1X = Utils.round(o1.getX(), 2);
- float o1Y = Utils.round(o1.getY(), 2);
- float o2X = Utils.round(o2.getX(), 2);
- float o2Y = Utils.round(o2.getY(), 2);
+ private final PDDocument pdfDocument;
- if (o1Y > o2Y)
- return 1;
- if (o1Y < o2Y)
- return -1;
- if (o1X > o2X)
- return 1;
- if (o1X < o2X)
- return -1;
- return 0;
- }
+ public ObjectExtractor(PDDocument pdfDocument) {
+ this.pdfDocument = pdfDocument;
}
- private static final char[] spaceLikeChars = { ' ', '-', '1', 'i' };
- private static final String NBSP = "\u00A0";
-
- private float minCharWidth = Float.MAX_VALUE,
- minCharHeight = Float.MAX_VALUE;
- private List characters;
- private List rulings;
- private RectangleSpatialIndex spatialIndex;
- private AffineTransform pageTransform;
- private Shape clippingPath;
- public List clippingPaths = new ArrayList();
- private boolean debugClippingPaths = false;
- private Rectangle2D transformedClippingPathBounds;
- private Shape transformedClippingPath;
- private boolean extractRulingLines = true;
- private final PDDocument pdf_document;
- protected List pdf_document_pages;
-
-
- public ObjectExtractor(PDDocument pdf_document) throws IOException {
- this(pdf_document, null);
- }
-
- public ObjectExtractor(PDDocument pdf_document, String password)
- throws IOException {
- super();
-
- // patch PageDrawer: dummy Graphics2D context so some drawing operators don't complain
- try {
- Field field = PageDrawer.class.getDeclaredField("graphics");
- field.setAccessible(true);
- field.set(this, new DummyGraphics2D());
- }
- catch (Exception e1) {
- }
-
- if (pdf_document.isEncrypted()) {
- try {
- pdf_document
- .openProtection(new StandardDecryptionMaterial(password));
- } catch (BadSecurityHandlerException e) {
- // TODO Auto-generated catch block
- throw new IOException("BadSecurityHandler");
- } catch (CryptographyException e) {
- throw new IOException("Document is encrypted");
- }
- }
- this.pdf_document = pdf_document;
- this.pdf_document_pages = this.pdf_document.getDocumentCatalog()
- .getAllPages();
-
- }
-
-
- protected Page extractPage(Integer page_number) throws IOException {
-
- if (page_number - 1 > this.pdf_document_pages.size() || page_number < 1) {
- throw new java.lang.IndexOutOfBoundsException(
- "Page number does not exist");
+ // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - //
+ protected Page extractPage(Integer pageNumber) throws IOException {
+ if (pageNumber > pdfDocument.getNumberOfPages() || pageNumber < 1) {
+ throw new java.lang.IndexOutOfBoundsException("Page number does not exist.");
}
+ PDPage page = pdfDocument.getPage(pageNumber - 1);
- PDPage p = (PDPage) this.pdf_document_pages.get(page_number - 1);
- PDStream contents = p.getContents();
+ ObjectExtractorStreamEngine streamEngine = new ObjectExtractorStreamEngine(page);
+ streamEngine.processPage(page);
- if (contents == null) {
- return null;
- }
- this.clear();
-
- this.drawPage(p);
+ TextStripper textStripper = new TextStripper(pdfDocument, pageNumber);
+ textStripper.process();
- Utils.sort(this.characters);
+ Utils.sort(textStripper.getTextElements(), Rectangle.ILL_DEFINED_ORDER);
- float w, h;
- int pageRotation = p.findRotation();
- if (Math.abs(pageRotation) == 90 || Math.abs(pageRotation) == 270) {
- w = p.findCropBox().getHeight();
- h = p.findCropBox().getWidth();
- }
- else {
- w = p.findCropBox().getWidth();
- h = p.findCropBox().getHeight();
+ float width, height;
+ int rotation = page.getRotation();
+ if (Math.abs(rotation) == 90 || Math.abs(rotation) == 270) {
+ width = page.getCropBox().getHeight();
+ height = page.getCropBox().getWidth();
+ } else {
+ width = page.getCropBox().getWidth();
+ height = page.getCropBox().getHeight();
}
- return new Page(0, 0, w, h, pageRotation, page_number, this.characters,
- this.getRulings(), this.minCharWidth, this.minCharHeight,
- this.spatialIndex);
+ return Page.Builder.newInstance()
+ .withPageDims(PageDims.of(0, 0, width, height))
+ .withRotation(rotation)
+ .withNumber(pageNumber)
+ .withPdPage(page)
+ .withPdDocument(pdfDocument)
+ .withRulings(streamEngine.rulings)
+ .withTextElements(textStripper.getTextElements())
+ .withMinCharWidth(textStripper.getMinCharWidth())
+ .withMinCharHeight(textStripper.getMinCharHeight())
+ .withIndex(textStripper.getSpatialIndex())
+ .build();
}
+ // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - //
public PageIterator extract(Iterable pages) {
return new PageIterator(this, pages);
}
public PageIterator extract() {
- return extract(Utils.range(1, this.pdf_document_pages.size() + 1));
+ return extract(Utils.range(1, pdfDocument.getNumberOfPages() + 1));
}
public Page extract(int pageNumber) {
return extract(Utils.range(pageNumber, pageNumber + 1)).next();
}
+ // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - //
public void close() throws IOException {
- this.pdf_document.close();
- }
-
- public void drawPage(PDPage p) throws IOException {
- page = p;
- PDStream contents = p.getContents();
- if (contents != null) {
- ensurePageSize();
- this.processStream(p, p.findResources(), contents.getStream());
- }
- }
-
- private void ensurePageSize() {
- if (this.pageSize == null && this.page != null) {
- PDRectangle cropBox = this.page.findCropBox();
- this.pageSize = cropBox == null ? null : cropBox
- .createDimension();
- }
- }
-
- private void clear() {
- this.characters = new ArrayList();
- this.rulings = new ArrayList();
- this.pageTransform = null;
- this.spatialIndex = new RectangleSpatialIndex();
- this.minCharWidth = Float.MAX_VALUE;
- this.minCharHeight = Float.MAX_VALUE;
- }
-
- @Override
- public void drawImage(Image awtImage, AffineTransform at) {
- // we just ignore images (for now)
- }
-
- public void strokeOrFillPath(boolean isFill) {
- GeneralPath path = this.getLinePath();
-
- if (!this.extractRulingLines) {
- this.getLinePath().reset();
- return;
- }
-
- PathIterator pi = path.getPathIterator(this.getPageTransform());
- float[] c = new float[6];
- int currentSegment;
-
- // skip paths whose first operation is not a MOVETO
- // or contains operations other than LINETO, MOVETO or CLOSE
- if ((pi.currentSegment(c) != PathIterator.SEG_MOVETO)) {
- path.reset();
- return;
- }
- pi.next();
- while (!pi.isDone()) {
- currentSegment = pi.currentSegment(c);
- if (currentSegment != PathIterator.SEG_LINETO
- && currentSegment != PathIterator.SEG_CLOSE
- && currentSegment != PathIterator.SEG_MOVETO) {
- path.reset();
- return;
- }
- pi.next();
- }
-
- // TODO: how to implement color filter?
-
- // skip the first path operation and save it as the starting position
- float[] first = new float[6];
- pi = path.getPathIterator(this.getPageTransform());
- pi.currentSegment(first);
- // last move
- Point2D.Float start_pos = new Point2D.Float(Utils.round(first[0], 2), Utils.round(first[1], 2));
- Point2D.Float last_move = start_pos;
- Point2D.Float end_pos = null;
- Line2D.Float line;
- PointComparator pc = new PointComparator();
-
- while (!pi.isDone()) {
- pi.next();
- currentSegment = pi.currentSegment(c);
- switch (currentSegment) {
- case PathIterator.SEG_LINETO:
- end_pos = new Point2D.Float(c[0], c[1]);
-
- line = pc.compare(start_pos, end_pos) == -1 ? new Line2D.Float(
- start_pos, end_pos) : new Line2D.Float(end_pos,
- start_pos);
-
- if (line.intersects(this.currentClippingPath())) {
- Ruling r = new Ruling(line.getP1(), line.getP2())
- .intersect(this.currentClippingPath());
-
- if (r.length() > 0.01) {
- this.rulings.add(r);
- }
- }
- break;
- case PathIterator.SEG_MOVETO:
- last_move = new Point2D.Float(c[0], c[1]);
- end_pos = last_move;
- break;
- case PathIterator.SEG_CLOSE:
- // according to PathIterator docs:
- // "the preceding subpath should be closed by appending a line
- // segment
- // back to the point corresponding to the most recent
- // SEG_MOVETO."
- line = pc.compare(end_pos, last_move) == -1 ? new Line2D.Float(
- end_pos, last_move) : new Line2D.Float(last_move,
- end_pos);
-
- if (line.intersects(this.currentClippingPath())) {
- Ruling r = new Ruling(line.getP1(), line.getP2())
- .intersect(this.currentClippingPath());
-
- if (r.length() > 0.01) {
- this.rulings.add(r);
- }
- }
- break;
- }
- start_pos = end_pos;
- }
- path.reset();
- }
-
- @Override
- public void strokePath() throws IOException {
- this.strokeOrFillPath(false);
- }
-
- @Override
- public void fillPath(int windingRule) throws IOException {
- //
- // float[] color_comps =
- // this.getGraphicsState().getNonStrokingColor().getJavaColor().getRGBColorComponents(null);
- float[] color = this.getGraphicsState().getNonStrokingColor().getJavaColor().getComponents(null);
- // TODO use color_comps as filter_by_color
- this.strokeOrFillPath(true);
- }
-
- private float currentSpaceWidth() {
- PDGraphicsState gs = this.getGraphicsState();
- PDTextState ts = gs.getTextState();
- PDFont font = ts.getFont();
- float fontSizeText = ts.getFontSize();
- float horizontalScalingText = ts.getHorizontalScalingPercent() / 100.0f;
- float spaceWidthText = 1000;
-
- if (font instanceof PDType3Font) {
- // TODO WHAT?
- }
-
- for (int i = 0; i < spaceLikeChars.length; i++) {
- spaceWidthText = font.getFontWidth(spaceLikeChars[i]);
- if (spaceWidthText > 0)
- break;
- }
-
- float ctm00 = gs.getCurrentTransformationMatrix().getValue(0, 0);
-
- return (float) ((spaceWidthText / 1000.0) * fontSizeText
- * horizontalScalingText * (ctm00 == 0 ? 1 : ctm00));
- }
-
- @Override
- protected void processTextPosition(TextPosition textPosition) {
- String c = textPosition.getCharacter();
-
- // if c not printable, return
- if (!isPrintable(c)) {
- return;
- }
-
- Float h = textPosition.getHeightDir();
-
- if (c.equals(NBSP)) { // replace non-breaking space for space
- c = " ";
- }
-
- float wos = textPosition.getWidthOfSpace();
-
- TextElement te = new TextElement(
- Utils.round(textPosition.getYDirAdj() - h, 2),
- Utils.round(textPosition.getXDirAdj(), 2),
- Utils.round(textPosition.getWidthDirAdj(), 2),
- Utils.round(textPosition.getHeightDir(), 2),
- textPosition.getFont(),
- textPosition.getFontSize(),
- c,
- // workaround a possible bug in PDFBox:
- // https://issues.apache.org/jira/browse/PDFBOX-1755
- (Float.isNaN(wos) || wos == 0) ? this.currentSpaceWidth() : wos,
- textPosition.getDir());
-
- if (this.currentClippingPath().intersects(te)) {
-
- this.minCharWidth = (float) Math.min(this.minCharWidth, te.getWidth());
- this.minCharHeight = (float) Math.min(this.minCharHeight, te.getHeight());
-
- this.spatialIndex.add(te);
- this.characters.add(te);
- }
-
- if (this.isDebugClippingPaths() && !this.clippingPaths.contains(this.currentClippingPath())) {
- this.clippingPaths.add(this.currentClippingPath());
- }
-
- }
-
- public float getMinCharWidth() {
- return minCharWidth;
- }
-
- public float getMinCharHeight() {
- return minCharHeight;
- }
-
- public AffineTransform getPageTransform() {
-
- if (this.pageTransform != null) {
- return this.pageTransform;
- }
-
- PDRectangle cb = page.findCropBox();
- int rotation = Math.abs(page.findRotation());
-
- this.pageTransform = new AffineTransform();
-
- if (rotation == 90 || rotation == 270) {
- this.pageTransform = AffineTransform.getRotateInstance(rotation * (Math.PI / 180.0), 0, 0);
- this.pageTransform.concatenate(AffineTransform.getScaleInstance(1, -1));
- this.pageTransform.concatenate(AffineTransform.getTranslateInstance(0, cb.getHeight()));
- this.pageTransform.concatenate(AffineTransform.getScaleInstance(1, -1));
- }
- return this.pageTransform;
- }
-
- //public Rectangle2D currentClippingPath() {
- public Rectangle2D currentClippingPath() {
- // Shape cp = this.getGraphicsState().getCurrentClippingPath();
- // if (cp == this.clippingPath) {
- // return this.transformedClippingPathBounds;
- // }
-
- this.clippingPath = this.getGraphicsState().getCurrentClippingPath();
- this.transformedClippingPath = this.getPageTransform()
- .createTransformedShape(this.clippingPath);
- this.transformedClippingPathBounds = this.transformedClippingPath
- .getBounds2D();
-
- return this.transformedClippingPathBounds;
- }
-
- public boolean isExtractRulingLines() {
- return extractRulingLines;
- }
-
- public void setExtractRulingLines(boolean extractRulingLines) {
- this.extractRulingLines = extractRulingLines;
+ pdfDocument.close();
}
-
- public List getRulings() {
- return rulings;
- }
-
- public List getCharacters() {
- return characters;
- }
-
- private static boolean isPrintable(String s) {
- Character c = s.charAt(0);
- Character.UnicodeBlock block = Character.UnicodeBlock.of(c);
- return (!Character.isISOControl(c)) && c != KeyEvent.CHAR_UNDEFINED
- && block != null && block != Character.UnicodeBlock.SPECIALS;
- }
-
- public boolean isDebugClippingPaths() {
- return debugClippingPaths;
- }
-
- public void setDebugClippingPaths(boolean debugClippingPaths) {
- this.debugClippingPaths = debugClippingPaths;
- }
-
- public int getPageCount() {
- return this.pdf_document_pages.size();
- }
-
+
}
diff --git a/src/main/java/technology/tabula/ObjectExtractorStreamEngine.java b/src/main/java/technology/tabula/ObjectExtractorStreamEngine.java
new file mode 100644
index 00000000..9907eca1
--- /dev/null
+++ b/src/main/java/technology/tabula/ObjectExtractorStreamEngine.java
@@ -0,0 +1,271 @@
+package technology.tabula;
+
+import java.awt.Shape;
+import java.awt.geom.AffineTransform;
+import java.awt.geom.GeneralPath;
+import java.awt.geom.Line2D;
+import java.awt.geom.PathIterator;
+import java.awt.geom.Point2D;
+import java.awt.geom.Rectangle2D;
+import java.util.ArrayList;
+import java.util.Comparator;
+import java.util.List;
+
+import org.apache.pdfbox.contentstream.PDFGraphicsStreamEngine;
+import org.apache.pdfbox.cos.COSName;
+import org.apache.pdfbox.pdmodel.PDPage;
+import org.apache.pdfbox.pdmodel.common.PDRectangle;
+import org.apache.pdfbox.pdmodel.graphics.image.PDImage;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import static java.awt.geom.PathIterator.*;
+
+class ObjectExtractorStreamEngine extends PDFGraphicsStreamEngine {
+
+ protected List rulings;
+ private AffineTransform pageTransform;
+ private boolean extractRulingLines = true;
+ private Logger logger;
+ private int clipWindingRule = -1;
+ private GeneralPath currentPath = new GeneralPath();
+
+ private static final float RULING_MINIMUM_LENGTH = 0.01f;
+
+ protected ObjectExtractorStreamEngine(PDPage page) {
+ super(page);
+ logger = LoggerFactory.getLogger(ObjectExtractorStreamEngine.class);
+ rulings = new ArrayList<>();
+
+ // Calculate page transform:
+ pageTransform = new AffineTransform();
+ PDRectangle pageCropBox = getPage().getCropBox();
+ int rotationAngleInDegrees = getPage().getRotation();
+
+ if (Math.abs(rotationAngleInDegrees) == 90 || Math.abs(rotationAngleInDegrees) == 270) {
+ double rotationAngleInRadians = rotationAngleInDegrees * (Math.PI / 180.0);
+ pageTransform = AffineTransform.getRotateInstance(rotationAngleInRadians, 0, 0);
+ } else {
+ double deltaX = 0;
+ double deltaY = pageCropBox.getHeight();
+ pageTransform.concatenate(AffineTransform.getTranslateInstance(deltaX, deltaY));
+ }
+
+ pageTransform.concatenate(AffineTransform.getScaleInstance(1, -1));
+ pageTransform.translate(-pageCropBox.getLowerLeftX(), -pageCropBox.getLowerLeftY());
+ }
+
+ // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - //
+ @Override
+ public void appendRectangle(Point2D p0, Point2D p1, Point2D p2, Point2D p3) {
+ currentPath.moveTo((float) p0.getX(), (float) p0.getY());
+ currentPath.lineTo((float) p1.getX(), (float) p1.getY());
+ currentPath.lineTo((float) p2.getX(), (float) p2.getY());
+ currentPath.lineTo((float) p3.getX(), (float) p3.getY());
+ currentPath.closePath();
+ }
+
+ @Override
+ public void clip(int windingRule) {
+ // The clipping path will not be updated until the succeeding painting
+ // operator is called.
+ clipWindingRule = windingRule;
+ }
+
+ @Override
+ public void closePath() {
+ currentPath.closePath();
+ }
+
+ @Override
+ public void curveTo(float x1, float y1, float x2, float y2, float x3, float y3) {
+ currentPath.curveTo(x1, y1, x2, y2, x3, y3);
+ }
+
+ @Override
+ public void drawImage(PDImage arg0) {}
+
+ @Override
+ public void endPath() {
+ if (clipWindingRule != -1) {
+ currentPath.setWindingRule(clipWindingRule);
+ getGraphicsState().intersectClippingPath(currentPath);
+ clipWindingRule = -1;
+ }
+ currentPath.reset();
+ }
+
+ @Override
+ public void fillAndStrokePath(int arg0) {
+ strokeOrFillPath(true);
+ }
+
+ @Override
+ public void fillPath(int arg0) {
+ strokeOrFillPath(true);
+ }
+
+ @Override
+ public Point2D getCurrentPoint() {
+ return currentPath.getCurrentPoint();
+ }
+
+ @Override
+ public void lineTo(float x, float y) {
+ currentPath.lineTo(x, y);
+ }
+
+ @Override
+ public void moveTo(float x, float y) {
+ currentPath.moveTo(x, y);
+ }
+
+ @Override
+ public void shadingFill(COSName arg0) {}
+
+ // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - //
+ @Override
+ public void strokePath() {
+ strokeOrFillPath(false);
+ }
+
+ private void strokeOrFillPath(boolean isFill) {
+ if (!extractRulingLines) {
+ currentPath.reset();
+ return;
+ }
+
+ boolean didNotPassedTheFilter = filterPathBySegmentType();
+ if (didNotPassedTheFilter) return;
+
+ // TODO: how to implement color filter?
+
+ // Skip the first path operation and save it as the starting point.
+ PathIterator pathIterator = currentPath.getPathIterator(getPageTransform());
+
+ float[] coordinates = new float[6];
+ int currentSegment;
+
+ Point2D.Float startPoint = getStartPoint(pathIterator);
+ Point2D.Float last_move = startPoint;
+ Point2D.Float endPoint = null;
+ Line2D.Float line;
+ PointComparator pointComparator = new PointComparator();
+
+ while (!pathIterator.isDone()) {
+ pathIterator.next();
+ // This can be the last segment, when pathIterator.isDone, but we need to
+ // process it otherwise us-017.pdf fails the last value.
+ try {
+ currentSegment = pathIterator.currentSegment(coordinates);
+ } catch (IndexOutOfBoundsException ex) {
+ continue;
+ }
+ switch (currentSegment) {
+ case SEG_LINETO:
+ endPoint = new Point2D.Float(coordinates[0], coordinates[1]);
+ if (startPoint == null || endPoint == null) {
+ break;
+ }
+ line = getLineBetween(startPoint, endPoint, pointComparator);
+ verifyLineIntersectsClipping(line);
+ break;
+ case SEG_MOVETO:
+ last_move = new Point2D.Float(coordinates[0], coordinates[1]);
+ endPoint = last_move;
+ break;
+ case SEG_CLOSE:
+ // According to PathIterator docs:
+ // "The preceding sub-path should be closed by appending a line
+ // segment back to the point corresponding to the most recent
+ // SEG_MOVETO."
+ if (startPoint == null || endPoint == null) {
+ break;
+ }
+ line = getLineBetween(endPoint, last_move, pointComparator);
+ verifyLineIntersectsClipping(line);
+ break;
+ }
+ startPoint = endPoint;
+ }
+ currentPath.reset();
+ }
+
+ private boolean filterPathBySegmentType() {
+ PathIterator pathIterator = currentPath.getPathIterator(pageTransform);
+ float[] coordinates = new float[6];
+ int currentSegmentType = pathIterator.currentSegment(coordinates);
+ if (currentSegmentType != SEG_MOVETO) {
+ currentPath.reset();
+ return true;
+ }
+ pathIterator.next();
+ while (!pathIterator.isDone()) {
+ currentSegmentType = pathIterator.currentSegment(coordinates);
+ if (currentSegmentType != SEG_LINETO && currentSegmentType != SEG_CLOSE && currentSegmentType != SEG_MOVETO) {
+ currentPath.reset();
+ return true;
+ }
+ pathIterator.next();
+ }
+ return false;
+ }
+
+ private Point2D.Float getStartPoint(PathIterator pathIterator) {
+ float[] startPointCoordinates = new float[6];
+ pathIterator.currentSegment(startPointCoordinates);
+ float x = Utils.round(startPointCoordinates[0], 2);
+ float y = Utils.round(startPointCoordinates[1], 2);
+ return new Point2D.Float(x, y);
+ }
+
+ private Line2D.Float getLineBetween(Point2D.Float pointA, Point2D.Float pointB, PointComparator pointComparator) {
+ if (pointComparator.compare(pointA, pointB) == -1) {
+ return new Line2D.Float(pointA, pointB);
+ }
+ return new Line2D.Float(pointB, pointA);
+ }
+
+ private void verifyLineIntersectsClipping(Line2D.Float line) {
+ Rectangle2D currentClippingPath = currentClippingPath();
+ if (line.intersects(currentClippingPath)) {
+ Ruling ruling = new Ruling(line.getP1(), line.getP2()).intersect(currentClippingPath);
+ if (ruling.length() > RULING_MINIMUM_LENGTH) {
+ rulings.add(ruling);
+ }
+ }
+ }
+
+ // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - //
+ public AffineTransform getPageTransform() {
+ return pageTransform;
+ }
+
+ public Rectangle2D currentClippingPath() {
+ Shape currentClippingPath = getGraphicsState().getCurrentClippingPath();
+ Shape transformedClippingPath = getPageTransform().createTransformedShape(currentClippingPath);
+ return transformedClippingPath.getBounds2D();
+ }
+
+ // TODO: repeated in SpreadsheetExtractionAlgorithm.
+ class PointComparator implements Comparator {
+ @Override
+ public int compare(Point2D p1, Point2D p2) {
+ float p1X = Utils.round(p1.getX(), 2);
+ float p1Y = Utils.round(p1.getY(), 2);
+ float p2X = Utils.round(p2.getX(), 2);
+ float p2Y = Utils.round(p2.getY(), 2);
+
+ if (p1Y > p2Y)
+ return 1;
+ if (p1Y < p2Y)
+ return -1;
+ if (p1X > p2X)
+ return 1;
+ if (p1X < p2X)
+ return -1;
+ return 0;
+ }
+ }
+
+}
diff --git a/src/main/java/technology/tabula/Page.java b/src/main/java/technology/tabula/Page.java
index 624a901a..ed74d14a 100644
--- a/src/main/java/technology/tabula/Page.java
+++ b/src/main/java/technology/tabula/Page.java
@@ -2,293 +2,415 @@
import java.awt.geom.Point2D;
import java.util.ArrayList;
-import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
-import java.util.HashMap;
import java.util.List;
-import java.util.Map;
+
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.PDPage;
+
+import static java.lang.Float.compare;
+import static java.util.Collections.min;
@SuppressWarnings("serial")
// TODO: this class should probably be called "PageArea" or something like that
public class Page extends Rectangle {
+ private int number;
private Integer rotation;
- private int pageNumber;
- private List texts;
- private List rulings, cleanRulings = null, verticalRulingLines = null, horizontalRulingLines = null;
private float minCharWidth;
private float minCharHeight;
- private RectangleSpatialIndex spatial_index;
- public Page(float top, float left, float width, float height, int rotation, int page_number) {
- super(top, left, width, height);
+ private List textElements;
+
+ // TODO: Create a class for 'List<Ruling>' that encapsulates all of these lists and their behaviors?
+ private List rulings,
+ cleanRulings = null,
+ verticalRulingLines = null,
+ horizontalRulingLines = null;
+
+ private PDPage pdPage;
+ private PDDocument pdDoc;
+
+ private RectangleSpatialIndex spatialIndex;
+
+ private static final float DEFAULT_MIN_CHAR_LENGTH = 7;
+
+ private Page(
+ PageDims pageDims,
+ int rotation,
+ int number,
+ PDPage pdPage,
+ PDDocument doc,
+ List characters,
+ List rulings,
+ float minCharWidth,
+ float minCharHeight,
+ RectangleSpatialIndex index
+ ) {
+ super(pageDims.getTop(), pageDims.getLeft(), pageDims.getWidth(), pageDims.getHeight());
this.rotation = rotation;
- this.pageNumber = page_number;
+ this.number = number;
+ this.pdPage = pdPage;
+ this.pdDoc = doc;
+ this.textElements = characters;
+ this.rulings = rulings;
+ this.minCharWidth = minCharWidth;
+ this.minCharHeight = minCharHeight;
+ this.spatialIndex = index;
}
-
- public Page(float top, float left, float width, float height, int rotation, int page_number,
- List characters, List rulings) {
- this(top, left, width, height, rotation, page_number);
- this.texts = characters;
- this.rulings = rulings;
+ /**
+ *
+ * @deprecated use {@link Builder} instead
+ */
+ @Deprecated
+ public Page(float top, float left, float width, float height, int rotation, int number, PDPage pdPage, PDDocument doc) {
+ super(top, left, width, height);
+ this.rotation = rotation;
+ this.number = number;
+ this.pdPage = pdPage;
+ this.pdDoc = doc;
+ }
+
+    /**
+     * Creates a page area like the shorter deprecated constructor and
+     * additionally stores the given text elements and rulings.
+     *
+     * @param characters text elements of the page area
+     * @param rulings    rulings of the page area
+     * @deprecated use {@link Builder} instead
+     */
+    @Deprecated
+    public Page(float top, float left, float width, float height, int rotation, int number, PDPage pdPage, PDDocument doc,
+                List characters, List rulings) {
+        this(top, left, width, height, rotation, number, pdPage, doc);
+        this.textElements = characters;
+        this.rulings = rulings;
     }
+    /**
+     * Creates a page area from the results of a stream engine and a text
+     * stripper: the engine supplies the rulings, the stripper supplies the text
+     * elements, the minimum character width and height, and the spatial index.
+     *
+     * @param streamEngine engine whose {@code rulings} are used
+     * @param textStripper stripper whose text elements, minimum character
+     *                     dimensions and spatial index are used
+     * @deprecated use {@link Builder} instead
+     */
+    @Deprecated
+    public Page(float top, float left, float width, float height, int rotation, int number, PDPage pdPage, PDDocument doc,
+                ObjectExtractorStreamEngine streamEngine, TextStripper textStripper) {
+        this(top, left, width, height, rotation, number, pdPage, doc, textStripper.getTextElements(), streamEngine.rulings);
+        this.minCharWidth = textStripper.getMinCharWidth();
+        this.minCharHeight = textStripper.getMinCharHeight();
+        this.spatialIndex = textStripper.getSpatialIndex();
+    }
- public Page(float top, float left, float width, float height, int rotation, int page_number,
- List characters, List rulings,
- float minCharWidth, float minCharHeight, RectangleSpatialIndex index) {
- this(top, left, width, height, rotation, page_number, characters, rulings);
- this.minCharHeight = minCharHeight;
- this.minCharWidth = minCharWidth;
- this.spatial_index = index;
+
+    /**
+     * Creates a page area with explicit text elements, rulings, minimum
+     * character dimensions and spatial index.
+     *
+     * @param characters    text elements of the page area
+     * @param rulings       rulings of the page area
+     * @param minCharWidth  minimum character width stored on the page
+     * @param minCharHeight minimum character height stored on the page
+     * @param index         spatial index stored on the page
+     * @deprecated use {@link Builder} instead
+     */
+    @Deprecated
+    public Page(float top, float left, float width, float height, int rotation, int number, PDPage pdPage, PDDocument doc,
+                List characters, List rulings,
+                float minCharWidth, float minCharHeight, RectangleSpatialIndex index) {
+        this(top, left, width, height, rotation, number, pdPage, doc, characters, rulings);
+        this.minCharHeight = minCharHeight;
+        this.minCharWidth = minCharWidth;
+        this.spatialIndex = index;
     }
-
+ // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - //
public Page getArea(Rectangle area) {
- List t = getText(area);
- Page rv = new Page(
- (float) area.getTop(),
- (float) area.getLeft(),
- (float) area.getWidth(),
- (float) area.getHeight(),
- rotation,
- pageNumber,
- t,
- Ruling.cropRulingsToArea(getRulings(), area),
-
- Collections.min(t, new Comparator() {
- @Override
- public int compare(TextElement te1, TextElement te2) {
- return java.lang.Float.compare(te1.width, te2.width);
- }}).width,
-
- Collections.min(t, new Comparator() {
- @Override
- public int compare(TextElement te1, TextElement te2) {
- return java.lang.Float.compare(te1.height, te2.height);
- }}).height,
-
- spatial_index);
-
- return rv;
+ List areaTextElements = getText(area);
+
+ float minimumCharWidth = getMinimumCharWidthFrom(areaTextElements);
+ float minimumCharHeight = getMinimumCharHeightFrom(areaTextElements);
+
+ final Page page = Page.Builder.newInstance()
+ .withPageDims(PageDims.of(area.getTop(), area.getLeft(), (float) area.getWidth(), (float) area.getHeight()))
+ .withRotation(rotation)
+ .withNumber(number)
+ .withPdPage(pdPage)
+ .withPdDocument(pdDoc)
+ .withTextElements(areaTextElements)
+ .withRulings(Ruling.cropRulingsToArea(getRulings(), area))
+ .withMinCharWidth(minimumCharWidth)
+ .withMinCharHeight(minimumCharHeight)
+ .withIndex(spatialIndex)
+ .build();
+
+ addBorderRulingsTo(page);
+
+ return page;
}
-
- public Page getArea(float top, float left, float bottom, float right) {
- Rectangle area = new Rectangle(top, left, right - left, bottom - top);
- return this.getArea(area);
+
+    /**
+     * Returns the smallest width among the given text elements, or
+     * DEFAULT_MIN_CHAR_LENGTH when the list is empty.
+     */
+    private float getMinimumCharWidthFrom(List areaTextElements) {
+        return areaTextElements.isEmpty()
+                ? DEFAULT_MIN_CHAR_LENGTH
+                : min(areaTextElements, (a, b) -> compare(a.width, b.width)).width;
     }
-
- public List getText() {
- return texts;
+
+    /**
+     * Returns the smallest height among the given text elements, or
+     * DEFAULT_MIN_CHAR_LENGTH when the list is empty.
+     */
+    private float getMinimumCharHeightFrom(List areaTextElements) {
+        return areaTextElements.isEmpty()
+                ? DEFAULT_MIN_CHAR_LENGTH
+                : min(areaTextElements, (a, b) -> compare(a.height, b.height)).height;
     }
-
- public List getText(Rectangle area) {
- return this.spatial_index.contains(area);
+
+    /**
+     * Adds four rulings along the edges of the given page area, in the order
+     * top, right, bottom, left.
+     */
+    private void addBorderRulingsTo(Page page) {
+        // Corners listed clockwise starting at the top-left one.
+        Point2D.Double[] corners = {
+                new Point2D.Double(page.getLeft(), page.getTop()),
+                new Point2D.Double(page.getRight(), page.getTop()),
+                new Point2D.Double(page.getRight(), page.getBottom()),
+                new Point2D.Double(page.getLeft(), page.getBottom())
+        };
+        // Each edge joins a corner to the next one, the last edge wrapping back to the first corner.
+        for (int i = 0; i < corners.length; i++) {
+            page.addRuling(new Ruling(corners[i], corners[(i + 1) % corners.length]));
+        }
     }
-
- public List getText(float top, float left, float bottom, float right) {
- return this.getText(new Rectangle(top, left, right - left, bottom - top));
+
+ public Page getArea(float top, float left, float bottom, float right) {
+ Rectangle area = new Rectangle(top, left, right - left, bottom - top);
+ return getArea(area);
}
+ // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - //
public Integer getRotation() {
return rotation;
}
public int getPageNumber() {
- return pageNumber;
+ return number;
+ }
+
+ /**
+ * @deprecated with no replacement
+ */
+ @Deprecated
+ public float getMinCharWidth() {
+ return minCharWidth;
+ }
+
+ /**
+ * @deprecated with no replacement
+ */
+ @Deprecated
+ public float getMinCharHeight() {
+ return minCharHeight;
+ }
+
+ // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - //
+ public List getText() {
+ return textElements;
+ }
+
+ public List getText(Rectangle area) {
+ return spatialIndex.contains(area);
+ }
+
+ /**
+ * @deprecated use {@linkplain #getText(Rectangle)} instead
+ */
+ @Deprecated
+ public List getText(float top, float left, float bottom, float right) {
+ return getText(new Rectangle(top, left, right - left, bottom - top));
}
+ /**
+ * @deprecated use {@linkplain #getText()} instead
+ */
+ @Deprecated
public List getTexts() {
- return texts;
+ return textElements;
}
-
+
/**
* Returns the minimum bounding box that contains all the TextElements on this Page
- * @return
*/
public Rectangle getTextBounds() {
List texts = this.getText();
if (!texts.isEmpty()) {
return Utils.bounds(texts);
- }
- else {
+ } else {
return new Rectangle();
}
-
}
+    /**
+     * Tells whether this page holds at least one text element.
+     *
+     * @return true when the page has a non-empty list of text elements;
+     *         false when the list is empty or was never set
+     * @deprecated with no replacement
+     */
+    @Deprecated
+    public boolean hasText() {
+        // textElements stays null when the page was created without text elements
+        // (for example through the shortest deprecated constructor).
+        return textElements != null && !textElements.isEmpty();
+    }
+
+ // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - //
public List getRulings() {
- if (this.cleanRulings != null) {
- return this.cleanRulings;
+ if (cleanRulings != null) {
+ return cleanRulings;
}
-
- if (this.rulings == null || this.rulings.isEmpty()) {
- this.verticalRulingLines = new ArrayList();
- this.horizontalRulingLines = new ArrayList();
- return new ArrayList();
+
+ if (rulings == null || rulings.isEmpty()) {
+ verticalRulingLines = new ArrayList<>();
+ horizontalRulingLines = new ArrayList<>();
+ return new ArrayList<>();
}
-
- this.snapPoints();
-
- List vrs = new ArrayList();
- for (Ruling vr: this.rulings) {
- if (vr.vertical()) {
- vrs.add(vr);
+
+ // TODO: Move as a static method to the Ruling class?
+ Utils.snapPoints(rulings, minCharWidth, minCharHeight);
+
+ verticalRulingLines = getCollapsedVerticalRulings();
+ horizontalRulingLines = getCollapsedHorizontalRulings();
+
+ cleanRulings = new ArrayList<>(verticalRulingLines);
+ cleanRulings.addAll(horizontalRulingLines);
+
+ return cleanRulings;
+ }
+
+ // TODO: Create a class for 'List<Ruling>' and encapsulate these behaviors within it?
+ private List getCollapsedVerticalRulings() {
+ List verticalRulings = new ArrayList<>();
+ for (Ruling ruling : rulings) {
+ if (ruling.vertical()) {
+ verticalRulings.add(ruling);
}
}
- this.verticalRulingLines = Ruling.collapseOrientedRulings(vrs);
-
- List hrs = new ArrayList();
- for (Ruling hr: this.rulings) {
- if (hr.horizontal()) {
- hrs.add(hr);
+ return Ruling.collapseOrientedRulings(verticalRulings);
+ }
+
+ private List getCollapsedHorizontalRulings() {
+ List horizontalRulings = new ArrayList<>();
+ for (Ruling ruling : rulings) {
+ if (ruling.horizontal()) {
+ horizontalRulings.add(ruling);
}
}
- this.horizontalRulingLines = Ruling.collapseOrientedRulings(hrs);
-
- this.cleanRulings = new ArrayList(this.verticalRulingLines);
- this.cleanRulings.addAll(this.horizontalRulingLines);
-
- return this.cleanRulings;
-
+ return Ruling.collapseOrientedRulings(horizontalRulings);
}
-
+
public List getVerticalRulings() {
- if (this.verticalRulingLines != null) {
- return this.verticalRulingLines;
+ if (verticalRulingLines != null) {
+ return verticalRulingLines;
}
- this.getRulings();
- return this.verticalRulingLines;
+ getRulings();
+ return verticalRulingLines;
}
-
+
public List getHorizontalRulings() {
- if (this.horizontalRulingLines != null) {
- return this.horizontalRulingLines;
+ if (horizontalRulingLines != null) {
+ return horizontalRulingLines;
}
- this.getRulings();
- return this.horizontalRulingLines;
+ getRulings();
+ return horizontalRulingLines;
}
-
- public void addRuling(Ruling r) {
- if (r.oblique()) {
- throw new UnsupportedOperationException("Can't add an oblique ruling");
+
+ public void addRuling(Ruling ruling) {
+ if (ruling.oblique()) {
+ throw new UnsupportedOperationException("Can't add an oblique ruling.");
}
- this.rulings.add(r);
- // clear caches
- this.verticalRulingLines = null;
- this.horizontalRulingLines = null;
- this.cleanRulings = null;
+ rulings.add(ruling);
+ // Clear caches:
+ verticalRulingLines = null;
+ horizontalRulingLines = null;
+ cleanRulings = null;
}
-
+
public List getUnprocessedRulings() {
- return this.rulings;
+ return rulings;
}
- public float getMinCharWidth() {
- return minCharWidth;
+ // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - //
+ public PDPage getPDPage() {
+ return pdPage;
}
- public float getMinCharHeight() {
- return minCharHeight;
+ public PDDocument getPDDoc() {
+ return pdDoc;
}
-
+
+ // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - //
+
+ /**
+ * @deprecated with no replacement
+ */
+ @Deprecated
public RectangleSpatialIndex getSpatialIndex() {
- return this.spatial_index;
- }
-
- public boolean hasText() {
- return this.texts.size() > 0;
+ return spatialIndex;
}
-
-
- public void snapPoints() {
-
- // collect points and keep a Line -> p1,p2 map
- Map linesToPoints = new HashMap();
- List points = new ArrayList();
- for (Ruling r: this.rulings) {
- Point2D p1 = r.getP1();
- Point2D p2 = r.getP2();
- linesToPoints.put(r, new Point2D[] { p1, p2 });
- points.add(p1);
- points.add(p2);
+
+ public static class Builder {
+ private PageDims pageDims;
+ private int rotation;
+ private int number;
+ private PDPage pdPage;
+ private PDDocument pdDocument;
+ private List textElements;
+ private List rulings;
+ private float minCharWidth;
+ private float minCharHeight;
+ private RectangleSpatialIndex index;
+
+ private Builder() {}
+
+ public static Builder newInstance() {
+ return new Builder();
}
-
- // snap by X
- Collections.sort(points, new Comparator() {
- @Override
- public int compare(Point2D arg0, Point2D arg1) {
- return java.lang.Double.compare(arg0.getX(), arg1.getX());
- }
- });
-
- List> groupedPoints = new ArrayList>();
- groupedPoints.add(new ArrayList(Arrays.asList(new Point2D[] { points.get(0) })));
-
- for (Point2D p: points.subList(1, points.size() - 1)) {
- List last = groupedPoints.get(groupedPoints.size() - 1);
- if (Math.abs(p.getX() - last.get(0).getX()) < this.minCharWidth) {
- groupedPoints.get(groupedPoints.size() - 1).add(p);
- }
- else {
- groupedPoints.add(new ArrayList(Arrays.asList(new Point2D[] { p })));
- }
+
+ public Builder withPageDims(PageDims pageDims) {
+ this.pageDims = pageDims;
+
+ return this;
}
-
- for(List group: groupedPoints) {
- float avgLoc = 0;
- for(Point2D p: group) {
- avgLoc += p.getX();
- }
- avgLoc /= group.size();
- for(Point2D p: group) {
- p.setLocation(avgLoc, p.getY());
- }
+
+ public Builder withRotation(int rotation) {
+ this.rotation = rotation;
+
+ return this;
}
- // ---
- // snap by Y
- Collections.sort(points, new Comparator() {
- @Override
- public int compare(Point2D arg0, Point2D arg1) {
- return java.lang.Double.compare(arg0.getY(), arg1.getY());
- }
- });
-
- groupedPoints = new ArrayList>();
- groupedPoints.add(new ArrayList(Arrays.asList(new Point2D[] { points.get(0) })));
-
- for (Point2D p: points.subList(1, points.size() - 1)) {
- List last = groupedPoints.get(groupedPoints.size() - 1);
- if (Math.abs(p.getY() - last.get(0).getY()) < this.minCharHeight) {
- groupedPoints.get(groupedPoints.size() - 1).add(p);
- }
- else {
- groupedPoints.add(new ArrayList(Arrays.asList(new Point2D[] { p })));
- }
+ public Builder withNumber(int number) {
+ this.number = number;
+
+ return this;
}
-
- for(List group: groupedPoints) {
- float avgLoc = 0;
- for(Point2D p: group) {
- avgLoc += p.getY();
- }
- avgLoc /= group.size();
- for(Point2D p: group) {
- p.setLocation(p.getX(), avgLoc);
- }
+
+ public Builder withPdPage(PDPage pdPage) {
+ this.pdPage = pdPage;
+
+ return this;
+ }
+
+ public Builder withPdDocument(PDDocument pdDocument) {
+ this.pdDocument = pdDocument;
+
+ return this;
}
- // ---
-
- // finally, modify lines
- for(Map.Entry ltp: linesToPoints.entrySet()) {
- Point2D[] p = ltp.getValue();
- ltp.getKey().setLine(p[0], p[1]);
+
+ public Builder withTextElements(List textElements) {
+ this.textElements = textElements;
+
+ return this;
+ }
+
+ public Builder withRulings(List rulings) {
+ this.rulings = rulings;
+
+ return this;
+ }
+
+ public Builder withMinCharWidth(float minCharWidth) {
+ this.minCharWidth = minCharWidth;
+
+ return this;
+ }
+
+ public Builder withMinCharHeight(float minCharHeight) {
+ this.minCharHeight = minCharHeight;
+
+ return this;
+ }
+
+ public Builder withIndex(RectangleSpatialIndex index) {
+ this.index = index;
+
+ return this;
+ }
+
+ public Page build() {
+ return new Page(pageDims, rotation, number, pdPage, pdDocument, textElements, rulings, minCharWidth, minCharHeight, index);
}
}
-
-
}
diff --git a/src/main/java/technology/tabula/PageDims.java b/src/main/java/technology/tabula/PageDims.java
new file mode 100644
index 00000000..1598d125
--- /dev/null
+++ b/src/main/java/technology/tabula/PageDims.java
@@ -0,0 +1,35 @@
+package technology.tabula;
+/** Immutable holder for a page's top, left, width and height (all floats); instances come from {@link #of}. */
+public class PageDims {
+ private final float top;
+ private final float left;
+ private final float width;
+ private final float height;
+ // Private on purpose: callers go through the static factory of(...).
+ private PageDims(final float top, final float left, final float width, final float height) {
+ this.top = top;
+ this.left = left;
+ this.width = width;
+ this.height = height;
+ }
+ /** Creates a PageDims from the given top, left, width and height, stored as passed (no validation). */
+ public static PageDims of(final float top, final float left, final float width, final float height) {
+ return new PageDims(top, left, width, height);
+ }
+
+ public float getTop() {
+ return top;
+ }
+
+ public float getLeft() {
+ return left;
+ }
+
+ public float getWidth() {
+ return width;
+ }
+
+ public float getHeight() {
+ return height;
+ }
+}
diff --git a/src/main/java/technology/tabula/PageIterator.java b/src/main/java/technology/tabula/PageIterator.java
index 5fec2a77..052ed54a 100644
--- a/src/main/java/technology/tabula/PageIterator.java
+++ b/src/main/java/technology/tabula/PageIterator.java
@@ -5,39 +5,39 @@
public class PageIterator implements Iterator {
- private ObjectExtractor oe;
+ private ObjectExtractor objectExtractor;
private Iterator pageIndexIterator;
-
- public PageIterator(ObjectExtractor oe, Iterable pages) {
+
+ public PageIterator(ObjectExtractor objectExtractor, Iterable pages) {
super();
- this.oe = oe;
+ this.objectExtractor = objectExtractor;
this.pageIndexIterator = pages.iterator();
}
+ // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - //
@Override
public boolean hasNext() {
- return this.pageIndexIterator.hasNext();
+ return pageIndexIterator.hasNext();
}
@Override
public Page next() {
- Page page = null;
+ Page nextPage = null;
if (!this.hasNext()) {
throw new IllegalStateException();
}
try {
- page = oe.extractPage(this.pageIndexIterator.next());
+ nextPage = objectExtractor.extractPage(pageIndexIterator.next());
} catch (IOException e) {
- // TODO Auto-generated catch block
e.printStackTrace();
}
- return page;
+ return nextPage;
}
+ // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - //
@Override
public void remove() {
throw new UnsupportedOperationException();
-
}
-}
\ No newline at end of file
+}
diff --git a/src/main/java/technology/tabula/Pair.java b/src/main/java/technology/tabula/Pair.java
new file mode 100644
index 00000000..d54cbbe5
--- /dev/null
+++ b/src/main/java/technology/tabula/Pair.java
@@ -0,0 +1,19 @@
+package technology.tabula;
+/** Immutable pair of a left and a right value, each with its own type parameter. */
+public class Pair<L, R> {
+ private final L left;
+ private final R right;
+ /** Stores both values exactly as given; null is not rejected. */
+ public Pair(L left, R right) {
+ this.left = left;
+ this.right = right;
+ }
+ /** @return the left value passed to the constructor */
+ public L getLeft() {
+ return this.left;
+ }
+ /** @return the right value passed to the constructor */
+ public R getRight() {
+ return this.right;
+ }
+}
diff --git a/src/main/java/technology/tabula/ProjectionProfile.java b/src/main/java/technology/tabula/ProjectionProfile.java
index e6377665..39ab9e41 100644
--- a/src/main/java/technology/tabula/ProjectionProfile.java
+++ b/src/main/java/technology/tabula/ProjectionProfile.java
@@ -5,6 +5,8 @@
import java.util.List;
+// NOTE: this class is currently not used by the extraction algorithms;
+// it is kept for potential future use.
public class ProjectionProfile {
public static final int DECIMAL_PLACES = 1; // fixed <-> float conversion precision
@@ -71,7 +73,7 @@ public float[] getHorizontalProjection() {
public float[] findVerticalSeparators(float minColumnWidth) {
boolean foundNarrower = false;
- List verticalSeparators = new ArrayList();
+ List verticalSeparators = new ArrayList<>();
for (Ruling r: area.getVerticalRulings()) {
if (r.length() / this.textBounds.getHeight() >= 0.95) {
verticalSeparators.add(toFixed(r.getPosition() - this.areaLeft));
@@ -103,7 +105,7 @@ public float[] findVerticalSeparators(float minColumnWidth) {
public float[] findHorizontalSeparators(float minRowHeight) {
boolean foundShorter = false;
- List horizontalSeparators = new ArrayList();
+ List horizontalSeparators = new ArrayList<>();
for (Ruling r: area.getHorizontalRulings()) {
System.out.println(r.length() / this.textBounds.getWidth());
if (r.length() / this.textBounds.getWidth() >= 0.95) {
@@ -134,7 +136,7 @@ public float[] findHorizontalSeparators(float minRowHeight) {
}
private static List findSeparatorsFromProjection(float[] derivative) {
- List separators = new ArrayList();
+ List separators = new ArrayList<>();
Integer lastNeg = null;
float s;
boolean positiveSlope = false;
@@ -165,7 +167,7 @@ public static float[] smooth(float[] data, int kernelSize) {
+ kernelSize / 2, data.length); j++) {
s += data[j];
}
- rv[i] = (float) Math.floor(s / (float) kernelSize);
+ rv[i] = (float) Math.floor(s / kernelSize);
}
}
return rv;
@@ -174,7 +176,6 @@ public static float[] smooth(float[] data, int kernelSize) {
/**
* Simple Low pass filter
- * @return
*/
public static float[] filter(float[] data, float alpha) {
@@ -212,7 +213,7 @@ private static int toFixed(double value) {
}
private static double toDouble(int value) {
- return (double) value / Math.pow(10, DECIMAL_PLACES);
+ return value / Math.pow(10, DECIMAL_PLACES);
}
}
diff --git a/src/main/java/technology/tabula/QuickSort.java b/src/main/java/technology/tabula/QuickSort.java
index 9ead5e1f..03388a15 100644
--- a/src/main/java/technology/tabula/QuickSort.java
+++ b/src/main/java/technology/tabula/QuickSort.java
@@ -16,99 +16,97 @@
*/
package technology.tabula;
+import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
+import java.util.RandomAccess;
import java.util.Stack;
/**
- * see http://de.wikipedia.org/wiki/Quicksort.
+ * An implementation of Quicksort.
+ *
+ * @see <a href="http://de.wikipedia.org/wiki/Quicksort">wikipedia</a>
*
* @author UWe Pachler
*/
-public class QuickSort
-{
-
- private QuickSort()
- {
- }
-
- private static final Comparator extends Comparable> objComp = new Comparator()
- {
- public int compare(Comparable object1, Comparable object2)
- {
- return object1.compareTo(object2);
- }
- };
+public final class QuickSort {
+
+ private QuickSort() {
+ // utility
+ }
+
+ /**
+ * Sorts the given list in place according to the natural order of its elements (delegates to the comparator overload).
+ */
+ public static <T extends Comparable<? super T>> void sort(List<T> list) {
+ sort(list, QuickSort.<T>naturalOrder()); // JAVA_8 replace with Comparator.naturalOrder() (and cleanup)
+ }
+
+ /**
+ * Sorts the given list using the given comparator; the caller's list holds the sorted order on return.
+ */
+ public static <T> void sort(List<T> list, Comparator<? super T> comparator) {
+ if (list instanceof RandomAccess) { // index access is cheap: sort the list directly
+ quicksort(list, comparator);
+ } else { // otherwise sort an ArrayList copy and write the result back
+ List<T> copy = new ArrayList<>(list);
+ quicksort(copy, comparator);
+ list.clear();
+ list.addAll(copy);
+ }
+ }
- /**
- * Sorts the given list using the given comparator.
- *
- * @param list list to be sorted
- * @param cmp comparator used to compare the object swithin the list
- */
- public static void sort(List list, Comparator cmp)
- {
- quicksort(list, cmp);
- }
+ private static void quicksort(List list, Comparator super T> cmp) {
+ Stack stack = new Stack<>();
+ stack.push(0);
+ stack.push(list.size());
+ while (!stack.isEmpty()) {
+ int right = stack.pop();
+ int left = stack.pop();
+
+ if (right - left < 2) continue;
+ int p = left + ((right - left) / 2);
+ p = partition(list, cmp, p, left, right);
- /**
- * Sorts the given list using compareTo as comparator.
- *
- * @param list list to be sorted
- */
- public static void sort(List list)
- {
- sort(list, (Comparator) objComp);
- }
+ stack.push(p + 1);
+ stack.push(right);
- private static void quicksort(List list, Comparator cmp)
- {
- Stack stack = new Stack();
- stack.push(0);
- stack.push(list.size());
- while (!stack.isEmpty()) {
- int right = stack.pop();
- int left = stack.pop();
- if (right - left < 2) continue;
- int p = left + ((right-left)/2);
- p = partition(list, cmp, p, left, right);
-
- stack.push(p+1);
- stack.push(right);
+ stack.push(left);
+ stack.push(p);
+ }
+ }
- stack.push(left);
- stack.push(p);
+ private static int partition(List list, Comparator super T> cmp, int p, int start, int end) {
+ int l = start;
+ int h = end - 2;
+ T piv = list.get(p);
+ swap(list, p, end - 1);
- }
- }
-
- private static int partition(List list, Comparator cmp, int p, int start, int end) {
- int l = start;
- int h = end - 2;
- T piv = list.get(p);
- swap(list,p,end-1);
+ while (l < h) {
+ if (cmp.compare(list.get(l), piv) <= 0) l++;
+ else if (cmp.compare(piv, list.get(h)) <= 0) h--;
+ else swap(list, l, h);
+ }
+ int idx = h;
+ if (cmp.compare(list.get(h), piv) < 0) idx++;
+ swap(list, end - 1, idx);
+ return idx;
+ }
- while (l < h) {
- if (cmp.compare(list.get(l), piv) <= 0) {
- l++;
- } else if (cmp.compare(piv, list.get(h)) <= 0) {
- h--;
- } else {
- swap(list,l,h);
- }
- }
- int idx = h;
- if (cmp.compare(list.get(h), piv) < 0) idx++;
- swap(list,end-1,idx);
- return idx;
- }
-
+ private static void swap(List list, int i, int j) {
+ T tmp = list.get(i);
+ list.set(i, list.get(j));
+ list.set(j, tmp);
+ }
- private static void swap(List list, int i, int j)
- {
- T tmp = list.get(i);
- list.set(i, list.get(j));
- list.set(j, tmp);
- }
+ @SuppressWarnings({ "rawtypes", "unchecked" })
+ private static final Comparator NATURAL_ORDER = new Comparator() {
+ @Override public int compare(Object l, Object r) { return ((Comparable) l).compareTo(r); }
+ };
+ // Returns the shared raw NATURAL_ORDER instance typed for T (unchecked conversion, hence the suppression).
+ @SuppressWarnings("unchecked")
+ private static <T extends Comparable<? super T>> Comparator<T> naturalOrder() {
+ return NATURAL_ORDER;
+ }
}
diff --git a/src/main/java/technology/tabula/Rectangle.java b/src/main/java/technology/tabula/Rectangle.java
index 4dc75298..b96fcd77 100644
--- a/src/main/java/technology/tabula/Rectangle.java
+++ b/src/main/java/technology/tabula/Rectangle.java
@@ -2,167 +2,177 @@
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
+import java.util.Comparator;
import java.util.List;
+import java.util.Locale;
@SuppressWarnings("serial")
-public class Rectangle extends Rectangle2D.Float implements Comparable {
-
- private static final float VERTICAL_COMPARISON_THRESHOLD = 0.4f;
-
- public Rectangle() {
- super();
- }
-
- public Rectangle(float top, float left, float width, float height) {
- super();
- this.setRect(left, top, width, height);
- }
-
- @Override
- public int compareTo(Rectangle other) {
- double thisBottom = this.getBottom();
- double otherBottom = other.getBottom();
- int rv;
-
- if (this.equals(other)) return 0;
-
- if (this.verticalOverlap(other) > VERTICAL_COMPARISON_THRESHOLD) {
- rv = java.lang.Double.compare(this.getX(), other.getX());
- }
- else {
- rv = java.lang.Double.compare(thisBottom, otherBottom);
- }
- return rv;
- }
-
-
-
- public float getArea() {
- return this.width * this.height;
- }
-
- public float verticalOverlap(Rectangle other) {
- return (float) Math.max(0, Math.min(this.getBottom(), other.getBottom()) - Math.max(this.getTop(), other.getTop()));
- }
-
- public boolean verticallyOverlaps(Rectangle other) {
- return verticalOverlap(other) > 0;
- }
-
- public float horizontalOverlap(Rectangle other) {
- return (float) Math.max(0, Math.min(this.getRight(), other.getRight()) - Math.max(this.getLeft(), other.getLeft()));
- }
-
- public boolean horizontallyOverlaps(Rectangle other) {
- return horizontalOverlap(other) > 0;
- }
-
- public float verticalOverlapRatio(Rectangle other) {
- float rv = 0,
- delta = (float) Math.min(this.getBottom() - this.getTop(), other.getBottom() - other.getTop());
-
- if (other.getTop() <= this.getTop() && this.getTop() <= other.getBottom() && other.getBottom() <= this.getBottom()) {
- rv = (float) ((other.getBottom() - this.getTop()) / delta);
- }
- else if (this.getTop() <= other.getTop() && other.getTop() <= this.getBottom() && this.getBottom() <= other.getBottom()) {
- rv = (float) ((this.getBottom() - other.getTop()) / delta);
- }
- else if (this.getTop() <= other.getTop() && other.getTop() <= other.getBottom() && other.getBottom() <= this.getBottom()) {
- rv = (float) ((other.getBottom() - other.getTop()) / delta);
- }
- else if (other.getTop() <= this.getTop() && this.getTop() <= this.getBottom() && this.getBottom() <= other.getBottom()) {
- rv = (float) ((this.getBottom() - this.getTop()) / delta);
- }
-
- return rv;
-
- }
-
- public float overlapRatio(Rectangle other) {
- double intersectionWidth = Math.max(0, Math.min(this.getRight(), other.getRight()) - Math.max(this.getLeft(), other.getLeft()));
- double intersectionHeight = Math.max(0, Math.min(this.getBottom(), other.getBottom()) - Math.max(this.getTop(), other.getTop()));
- double intersectionArea = Math.max(0, intersectionWidth * intersectionHeight);
- double unionArea = this.getArea() + other.getArea() - intersectionArea;
-
- return (float) (intersectionArea / unionArea);
- }
-
- public Rectangle merge(Rectangle other) {
- this.setRect(this.createUnion(other));
- return this;
- }
-
- public float getTop() {
- return (float) this.getMinY();
- }
-
- public void setTop(float top) {
- float deltaHeight = top - this.y;
- this.setRect(this.x, top, this.width, this.height - deltaHeight);
- }
-
- public float getRight() {
- return (float) this.getMaxX();
- }
-
- public void setRight(float right) {
- this.setRect(this.x, this.y, right - this.x, this.height);
- }
-
- public float getLeft() {
- return (float) this.getMinX();
- }
-
- public void setLeft(float left) {
- float deltaWidth = left - this.x;
- this.setRect(left, this.y, this.width - deltaWidth, this.height);
- }
-
- public float getBottom() {
- return (float) this.getMaxY();
- }
-
- public void setBottom(float bottom) {
- this.setRect(this.x, this.y, this.width, bottom - this.y);
- }
-
- public Point2D[] getPoints() {
- return new Point2D[] {
- new Point2D.Float((float) this.getLeft(), (float) this.getTop()),
- new Point2D.Float((float) this.getRight(), (float) this.getTop()),
- new Point2D.Float((float) this.getRight(), (float) this.getBottom()),
- new Point2D.Float((float) this.getLeft(), (float) this.getBottom())
- };
- }
-
- @Override
- public String toString() {
- StringBuilder sb = new StringBuilder();
- String s = super.toString();
- sb.append(s.substring(0, s.length() - 1));
- sb.append(String.format(",bottom=%f,right=%f]", this.getBottom(), this.getRight()));
- return sb.toString();
- }
-
-
- /**
- * @param rectangles
- * @return minimum bounding box that contains all the rectangles
- */
- public static Rectangle boundingBoxOf(List extends Rectangle> rectangles) {
- float minx = java.lang.Float.MAX_VALUE;
- float miny = java.lang.Float.MAX_VALUE;
- float maxx = java.lang.Float.MIN_VALUE;
- float maxy = java.lang.Float.MIN_VALUE;
-
- for (Rectangle r: rectangles) {
- minx = (float) Math.min(r.getMinX(), minx);
- miny = (float) Math.min(r.getMinY(), miny);
- maxx = (float) Math.max(r.getMaxX(), maxx);
- maxy = (float) Math.max(r.getMaxY(), maxy);
- }
- return new Rectangle(miny, minx, maxx - minx, maxy - miny);
- }
-
+public class Rectangle extends Rectangle2D.Float {
+
+ /**
+ * Ill-defined comparator, from when Rectangle was Comparable.
+ * Equal rectangles compare as 0; a vertical overlap above the threshold orders by x (reversed when both report isLtrDominant() == -1); otherwise by bottom.
+ * @see "PR 116"
+ * @deprecated with no replacement
+ */
+ @Deprecated
+ public static final Comparator<Rectangle> ILL_DEFINED_ORDER = new Comparator<Rectangle>() {
+ @Override public int compare(Rectangle o1, Rectangle o2) {
+ if (o1.equals(o2)) return 0;
+ if (o1.verticalOverlap(o2) > VERTICAL_COMPARISON_THRESHOLD) { // overlap exceeds the threshold: compare horizontally
+ return o1.isLtrDominant() == -1 && o2.isLtrDominant() == -1
+ ? - java.lang.Double.compare(o1.getX(), o2.getX())
+ : java.lang.Double.compare(o1.getX(), o2.getX());
+ } else {
+ return java.lang.Float.compare(o1.getBottom(), o2.getBottom());
+ }
+ }
+ };
+
+ protected static final float VERTICAL_COMPARISON_THRESHOLD = 0.4f;
+
+ public Rectangle() {
+ super();
+ }
+
+ public Rectangle(float top, float left, float width, float height) {
+ super();
+ this.setRect(left, top, width, height);
+ }
+
+ public int compareTo(Rectangle other) {
+ return ILL_DEFINED_ORDER.compare(this, other);
+ }
+
+ // Direction hook read by ILL_DEFINED_ORDER: when both rectangles return -1 the x-order is
+ // reversed. This base implementation returns 0; see technology.tabula.TextChunk.
+ public int isLtrDominant() {
+ return 0;
+ }
+
+ public float getArea() {
+ return this.width * this.height;
+ }
+
+ public float verticalOverlap(Rectangle other) {
+ return Math.max(0, Math.min(this.getBottom(), other.getBottom()) - Math.max(this.getTop(), other.getTop()));
+ }
+
+ public boolean verticallyOverlaps(Rectangle other) {
+ return verticalOverlap(other) > 0;
+ }
+
+ public float horizontalOverlap(Rectangle other) {
+ return Math.max(0, Math.min(this.getRight(), other.getRight()) - Math.max(this.getLeft(), other.getLeft()));
+ }
+
+ public boolean horizontallyOverlaps(Rectangle other) {
+ return horizontalOverlap(other) > 0;
+ }
+
+ public float verticalOverlapRatio(Rectangle other) {
+ float rv = 0, delta = Math.min(this.getBottom() - this.getTop(), other.getBottom() - other.getTop());
+
+ if (other.getTop() <= this.getTop() && this.getTop() <= other.getBottom()
+ && other.getBottom() <= this.getBottom()) {
+ rv = (other.getBottom() - this.getTop()) / delta;
+ } else if (this.getTop() <= other.getTop() && other.getTop() <= this.getBottom()
+ && this.getBottom() <= other.getBottom()) {
+ rv = (this.getBottom() - other.getTop()) / delta;
+ } else if (this.getTop() <= other.getTop() && other.getTop() <= other.getBottom()
+ && other.getBottom() <= this.getBottom()) {
+ rv = (other.getBottom() - other.getTop()) / delta;
+ } else if (other.getTop() <= this.getTop() && this.getTop() <= this.getBottom()
+ && this.getBottom() <= other.getBottom()) {
+ rv = (this.getBottom() - this.getTop()) / delta;
+ }
+
+ return rv;
+
+ }
+
+ public float overlapRatio(Rectangle other) {
+ double intersectionWidth = Math.max(0,
+ Math.min(this.getRight(), other.getRight()) - Math.max(this.getLeft(), other.getLeft()));
+ double intersectionHeight = Math.max(0,
+ Math.min(this.getBottom(), other.getBottom()) - Math.max(this.getTop(), other.getTop()));
+ double intersectionArea = Math.max(0, intersectionWidth * intersectionHeight);
+ double unionArea = this.getArea() + other.getArea() - intersectionArea;
+
+ return (float) (intersectionArea / unionArea);
+ }
+
+ public Rectangle merge(Rectangle other) {
+ this.setRect(this.createUnion(other));
+ return this;
+ }
+
+ public float getTop() {
+ return (float) this.getMinY();
+ }
+
+ public void setTop(float top) {
+ float deltaHeight = top - this.y;
+ this.setRect(this.x, top, this.width, this.height - deltaHeight);
+ }
+
+ public float getRight() {
+ return (float) this.getMaxX();
+ }
+
+ public void setRight(float right) {
+ this.setRect(this.x, this.y, right - this.x, this.height);
+ }
+
+ public float getLeft() {
+ return (float) this.getMinX();
+ }
+
+ public void setLeft(float left) {
+ float deltaWidth = left - this.x;
+ this.setRect(left, this.y, this.width - deltaWidth, this.height);
+ }
+
+ public float getBottom() {
+ return (float) this.getMaxY();
+ }
+
+ public void setBottom(float bottom) {
+ this.setRect(this.x, this.y, this.width, bottom - this.y);
+ }
+
+ public Point2D[] getPoints() {
+ return new Point2D[] { new Point2D.Float(this.getLeft(), this.getTop()),
+ new Point2D.Float(this.getRight(), this.getTop()), new Point2D.Float(this.getRight(), this.getBottom()),
+ new Point2D.Float(this.getLeft(), this.getBottom()) };
+ }
+
+ @Override
+ public String toString() {
+ StringBuilder sb = new StringBuilder();
+ String s = super.toString();
+ sb.append(s.substring(0, s.length() - 1));
+ sb.append(String.format(Locale.US, ",bottom=%f,right=%f]", this.getBottom(), this.getRight()));
+ return sb.toString();
+ }
+
+ /**
+ * @param rectangles rectangles to enclose; for an empty list the loop never runs and the result is built from the initial sentinels
+ * @return minimum bounding box that contains all the rectangles
+ */
+ public static Rectangle boundingBoxOf(List<? extends Rectangle> rectangles) {
+ float minx = java.lang.Float.MAX_VALUE;
+ float miny = java.lang.Float.MAX_VALUE;
+ float maxx = -java.lang.Float.MAX_VALUE; // not Float.MIN_VALUE: that is the smallest positive float
+ float maxy = -java.lang.Float.MAX_VALUE; // lowest finite float, so all-negative coordinates are handled
+
+ for (Rectangle r : rectangles) {
+ minx = (float) Math.min(r.getMinX(), minx);
+ miny = (float) Math.min(r.getMinY(), miny);
+ maxx = (float) Math.max(r.getMaxX(), maxx);
+ maxy = (float) Math.max(r.getMaxY(), maxy);
+ }
+ return new Rectangle(miny, minx, maxx - minx, maxy - miny);
+ }
}
diff --git a/src/main/java/technology/tabula/RectangleSpatialIndex.java b/src/main/java/technology/tabula/RectangleSpatialIndex.java
index e3aa633e..0e942545 100644
--- a/src/main/java/technology/tabula/RectangleSpatialIndex.java
+++ b/src/main/java/technology/tabula/RectangleSpatialIndex.java
@@ -1,88 +1,47 @@
package technology.tabula;
-import gnu.trove.procedure.TIntProcedure;
-
import java.util.ArrayList;
-import java.util.Collections;
import java.util.List;
-import net.sf.jsi.SpatialIndex;
-import net.sf.jsi.rtree.RTree;
+import org.locationtech.jts.geom.Envelope;
+import org.locationtech.jts.index.strtree.STRtree;
-class RectangleSpatialIndex {
+public class RectangleSpatialIndex {
- class SaveToListProcedure implements TIntProcedure {
- private List ids = new ArrayList();
- public boolean execute(int id) {
- ids.add(id);
- return true;
- };
-
- private List getIds() {
- return ids;
- }
- };
-
- private final SpatialIndex si;
- private final List rectangles;
- private Rectangle bounds = null;
-
- public RectangleSpatialIndex() {
- si = new RTree();
- si.init(null);
- rectangles = new ArrayList();
- }
-
+ private final STRtree si = new STRtree();
+ private final List rectangles = new ArrayList<>();
+
public void add(T te) {
rectangles.add(te);
- if (bounds == null) {
- bounds = new Rectangle();
- bounds.setRect(te);
- }
- else {
- bounds.merge(te);
- }
- si.add(rectangleToSpatialIndexRectangle(te), rectangles.size() - 1);
+ si.insert(new Envelope(te.getLeft(), te.getRight(), te.getBottom(), te.getTop()), te);
}
public List contains(Rectangle r) {
- SaveToListProcedure proc = new SaveToListProcedure();
- si.contains(rectangleToSpatialIndexRectangle(r), proc);
- ArrayList rv = new ArrayList();
- for (int i : proc.getIds()) {
- rv.add(rectangles.get(i));
+ List intersection = si.query(new Envelope(r.getLeft(), r.getRight(), r.getTop(), r.getBottom()));
+ List rv = new ArrayList();
+
+ for (T ir: intersection) {
+ if (r.contains(ir)) {
+ rv.add(ir);
+ }
}
- Utils.sort(rv);
+
+ Utils.sort(rv, Rectangle.ILL_DEFINED_ORDER);
return rv;
}
public List intersects(Rectangle r) {
- SaveToListProcedure proc = new SaveToListProcedure();
- si.intersects(rectangleToSpatialIndexRectangle(r), proc);
- ArrayList rv = new ArrayList();
- for (int i : proc.getIds()) {
- rv.add(rectangles.get(i));
- }
- Utils.sort(rv);
- return rv;
+ return si.query(new Envelope(r.getLeft(), r.getRight(), r.getTop(), r.getBottom()));
}
- private net.sf.jsi.Rectangle rectangleToSpatialIndexRectangle(Rectangle r) {
- return new net.sf.jsi.Rectangle((float) r.getX(),
- (float) r.getY(),
- (float) (r.getX() + r.getWidth()),
- (float) (r.getY() + r.getHeight()));
- }
-
-
/**
* Minimum bounding box of all the Rectangles contained on this RectangleSpatialIndex
*
* @return a Rectangle
*/
public Rectangle getBounds() {
- return bounds;
+ return Rectangle.boundingBoxOf(rectangles);
}
}
diff --git a/src/main/java/technology/tabula/RectangularTextContainer.java b/src/main/java/technology/tabula/RectangularTextContainer.java
index f9e0036f..934b5f13 100644
--- a/src/main/java/technology/tabula/RectangularTextContainer.java
+++ b/src/main/java/technology/tabula/RectangularTextContainer.java
@@ -1,35 +1,51 @@
package technology.tabula;
+import java.util.ArrayList;
import java.util.List;
@SuppressWarnings("serial")
-public abstract class RectangularTextContainer extends Rectangle {
-
- public RectangularTextContainer(float top, float left, float width, float height) {
- super(top, left, width, height);
- }
-
- public String toString() {
- StringBuilder sb = new StringBuilder();
- String s = super.toString();
- sb.append(s.substring(0, s.length() - 1));
- sb.append(String.format(",text=%s]", this.getText() == null ? "null" : "\"" + this.getText() + "\""));
- return sb.toString();
- }
-
- public RectangularTextContainer merge(RectangularTextContainer other) {
- if (this.compareTo(other) < 0) {
- this.getTextElements().addAll(other.getTextElements());
-
- }
- else {
- this.getTextElements().addAll(0, other.getTextElements());
- }
- super.merge(other);
- return this;
- }
-
- public abstract String getText();
- public abstract String getText(boolean useLineReturns);
- public abstract List getTextElements();
+public class RectangularTextContainer extends Rectangle implements HasText {
+ // Elements held by this container; merge() appends or prepends another container's elements to this list.
+ protected List textElements = new ArrayList<>();
+
+ protected RectangularTextContainer(float top, float left, float width, float height) {
+ super(top, left, width, height);
+ }
+ // Merges other into this: its elements go after ours when compareTo(other) < 0, otherwise before; then super.merge(other) runs. Returns this.
+ public RectangularTextContainer merge(RectangularTextContainer other) {
+ if (compareTo(other) < 0) {
+ this.getTextElements().addAll(other.getTextElements());
+ } else {
+ this.getTextElements().addAll(0, other.getTextElements());
+ }
+ super.merge(other);
+ return this;
+ }
+
+ public List getTextElements() {
+ return textElements;
+ }
+
+ public void setTextElements(List textElements) {
+ this.textElements = textElements;
+ }
+ // Both getText overloads throw in this class; subclasses presumably override them — TODO confirm.
+ @Override
+ public String getText() {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public String getText(boolean useLineReturns) {
+ throw new UnsupportedOperationException();
+ }
+ // NOTE(review): toString() calls getText(), so on this class as written it throws UnsupportedOperationException.
+ @Override public String toString() {
+ StringBuilder sb = new StringBuilder();
+ String s = super.toString();
+ sb.append(s.substring(0, s.length() - 1));
+ sb.append(String.format(",text=%s]", this.getText() == null ? "null" : "\"" + this.getText() + "\""));
+ return sb.toString();
+ }
+
+}
diff --git a/src/main/java/technology/tabula/Ruling.java b/src/main/java/technology/tabula/Ruling.java
index 1433878b..213ce87f 100644
--- a/src/main/java/technology/tabula/Ruling.java
+++ b/src/main/java/technology/tabula/Ruling.java
@@ -8,6 +8,7 @@
import java.util.Comparator;
import java.util.Formatter;
import java.util.List;
+import java.util.Locale;
import java.util.Map;
import java.util.TreeMap;
@@ -16,7 +17,7 @@ public class Ruling extends Line2D.Float {
private static int PERPENDICULAR_PIXEL_EXPAND_AMOUNT = 2;
private static int COLINEAR_OR_PARALLEL_PIXEL_EXPAND_AMOUNT = 1;
- private enum SOType { VERTICAL, HRIGHT, HLEFT };
+ private enum SOType { VERTICAL, HRIGHT, HLEFT }
public Ruling(float top, float left, float width, float height) {
this(new Point2D.Float(left, top), new Point2D.Float(left+width, top+height));
@@ -24,8 +25,14 @@ public Ruling(float top, float left, float width, float height) {
public Ruling(Point2D p1, Point2D p2) {
super(p1, p2);
-
- // normalize almost vertical or almost horizontal lines
+ this.normalize();
+ }
+
+ /**
+ * Normalize almost horizontal or almost vertical lines
+ */
+ public void normalize() {
+
double angle = this.getAngle();
if (Utils.within(angle, 0, 1) || Utils.within(angle, 180, 1)) { // almost horizontal
this.setLine(this.x1, this.y1, this.x2, this.y1);
@@ -33,9 +40,6 @@ public Ruling(Point2D p1, Point2D p2) {
else if (Utils.within(angle, 90, 1) || Utils.within(angle, 270, 1)) { // almost vertical
this.setLine(this.x1, this.y1, this.x1, this.y2);
}
-// else {
-// System.out.println("oblique: " + this + " ("+ this.getAngle() + ")");
-// }
}
public boolean vertical() {
@@ -111,6 +115,20 @@ public void setEnd(float v) {
this.setRight(v);
}
}
+
+ private void setStartEnd(float start, float end) {
+ if (this.oblique()) {
+ throw new UnsupportedOperationException();
+ }
+ if (this.vertical()) {
+ this.setTop(start);
+ this.setBottom(end);
+ }
+ else {
+ this.setLeft(start);
+ this.setRight(end);
+ }
+ }
// -----
@@ -129,10 +147,14 @@ public boolean colinear(Point2D point) {
// because the expansions are additive
// (e.g. two vertical lines, at x = 100, with one having y2 of 98 and the other having y1 of 102 would
// erroneously be said to nearlyIntersect if they were each expanded by 2 (since they'd both terminate at 100).
- // The COLINEAR_OR_PARALLEL_PIXEL_EXPAND_AMOUNT is only 1 so the total expansion is 2.
+ // By default the COLINEAR_OR_PARALLEL_PIXEL_EXPAND_AMOUNT is only 1 so the total expansion is 2.
// A total expansion amount of 2 is empirically verified to work sometimes. It's not a magic number from any
// source other than a little bit of experience.)
public boolean nearlyIntersects(Ruling another) {
+ return this.nearlyIntersects(another, COLINEAR_OR_PARALLEL_PIXEL_EXPAND_AMOUNT);
+ }
+
+ public boolean nearlyIntersects(Ruling another, int colinearOrParallelExpandAmount) {
if (this.intersectsLine(another)) {
return true;
}
@@ -143,8 +165,8 @@ public boolean nearlyIntersects(Ruling another) {
rv = this.expand(PERPENDICULAR_PIXEL_EXPAND_AMOUNT).intersectsLine(another);
}
else {
- rv = this.expand(COLINEAR_OR_PARALLEL_PIXEL_EXPAND_AMOUNT)
- .intersectsLine(another.expand(COLINEAR_OR_PARALLEL_PIXEL_EXPAND_AMOUNT));
+ rv = this.expand(colinearOrParallelExpandAmount)
+ .intersectsLine(another.expand(colinearOrParallelExpandAmount));
}
return rv;
@@ -206,11 +228,6 @@ public boolean equals(Object other) {
return this.getP1().equals(o.getP1()) && this.getP2().equals(o.getP2());
}
- @Override
- public int hashCode() {
- return super.hashCode();
- }
-
public float getTop() {
return this.y1;
}
@@ -267,13 +284,13 @@ public double getAngle() {
public String toString() {
StringBuilder sb = new StringBuilder();
Formatter formatter = new Formatter(sb);
- String rv = formatter.format("%s[x1=%f y1=%f x2=%f y2=%f]", this.getClass().toString(), this.x1, this.y1, this.x2, this.y2).toString();
+ String rv = formatter.format(Locale.US, "%s[x1=%f y1=%f x2=%f y2=%f]", this.getClass().toString(), this.x1, this.y1, this.x2, this.y2).toString();
formatter.close();
return rv;
}
public static List cropRulingsToArea(List rulings, Rectangle2D area) {
- ArrayList rv = new ArrayList();
+ ArrayList rv = new ArrayList<>();
for (Ruling r : rulings) {
if (r.intersects(area)) {
rv.add(r.intersect(area));
@@ -298,15 +315,15 @@ public SortObject(SOType type, float position, Ruling ruling) {
}
}
- List sos = new ArrayList();
+ List sos = new ArrayList<>();
- TreeMap tree = new TreeMap(new Comparator() {
+ TreeMap tree = new TreeMap<>(new Comparator() {
@Override
public int compare(Ruling o1, Ruling o2) {
return java.lang.Double.compare(o1.getTop(), o2.getTop());
}});
- TreeMap rv = new TreeMap(new Comparator() {
+ TreeMap rv = new TreeMap<>(new Comparator() {
@Override
public int compare(Point2D o1, Point2D o2) {
if (o1.getY() > o2.getY()) return 1;
@@ -379,26 +396,39 @@ else if (a.type == SOType.HRIGHT && b.type == SOType.VERTICAL) {
return rv;
}
-
+
public static List collapseOrientedRulings(List lines) {
- ArrayList rv = new ArrayList();
- if (lines.size() == 0) {
- return rv;
- }
+ return collapseOrientedRulings(lines, COLINEAR_OR_PARALLEL_PIXEL_EXPAND_AMOUNT);
+ }
+
+ public static List collapseOrientedRulings(List lines, int expandAmount) {
+ ArrayList