diff --git a/.travis.yml b/.travis.yml
index 2a5ffc72..7397abbf 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,10 +1,9 @@
language: java
-script: mvn test
+install: mvn install -DskipTests=true -Dmaven.javadoc.skip=true -Dgpg.skip=true -B -V
+script: mvn test -Dgpg.skip=true
jdk:
- - oraclejdk7
- - openjdk7
- - oraclejdk8
+ - openjdk8
+ - openjdk9
+ - openjdk10
+ - openjdk11
sudo: false
-
-
-
diff --git a/LICENSE b/LICENSE
index 06bdd025..4beb04ee 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,6 +1,6 @@
The MIT License (MIT)
-Copyright (c) 2014 Manuel Aristarán
+Copyright (c) 2014-2016 Manuel Aristarán
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
@@ -18,4 +18,4 @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
\ No newline at end of file
+SOFTWARE.
diff --git a/README.md b/README.md
index 55cb02e1..2a08d3ac 100644
--- a/README.md
+++ b/README.md
@@ -1,66 +1,116 @@
-tabula-java [](https://travis-ci.org/tabulapdf/tabula-java) [](https://gitter.im/tabulapdf/tabula-java?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
+tabula-java [Travis CI build status](https://travis-ci.org/tabulapdf/tabula-java) [AppVeyor build status](https://ci.appveyor.com/project/jazzido/tabula-java)
===========
-`tabula-java` is a library for extracting tables from PDF files. It is a Java rewrite of [`tabula-extractor`](http://github.com/tabulapdf/tabula-extractor), that is a thin wrapper around this library.
+`tabula-java` is a library for extracting tables from PDF files — it is the table extraction engine that powers [Tabula](http://tabula.technology/) ([repo](http://github.com/tabulapdf/tabula)). You can use `tabula-java` as a command-line tool to programmatically extract tables from PDFs.
+
+© 2014-2020 Manuel Aristarán. Available under MIT License. See [`LICENSE`](LICENSE).
## Download
Download a version of the tabula-java's jar, with all dependencies included, that works on Mac, Windows and Linux from our [releases page](../../releases).
-## Build instructions
-
-Clone this repo and run:
-
-```
-mvn clean compile assembly:single
-```
-
-## Examples
+## Usage Examples
`tabula-java` provides a command line application:
```
-$ java -jar ./target/tabula-0.8.0-jar-with-dependencies.jar --help
-
-usage: tabula [-a ] [-c ] [-d] [-f ] [-g] [-h] [-i]
- [-n] [-o ] [-p ] [-r] [-s ] [-u] [-v]
+$ java -jar target/tabula-1.0.2-jar-with-dependencies.jar --help
+usage: tabula [-a <AREA>] [-b <DIRECTORY>] [-c <COLUMNS>] [-f <FORMAT>]
+       [-g] [-h] [-i] [-l] [-n] [-o <OUTFILE>] [-p <PAGES>] [-r] [-s
+       <PASSWORD>] [-t] [-u] [-v]
Tabula helps you extract tables from PDFs
- -a,--area Portion of the page to analyze
- (top,left,bottom,right). Example: --area
- 269.875,12.75,790.5,561. Default is entire
- page
+
+ -a,--area -a/--area = Portion of the page to analyze.
+ Example: --area 269.875,12.75,790.5,561.
+ Accepts top,left,bottom,right i.e. y1,x1,y2,x2
+ where all values are in points relative to the
+ top left corner. If all values are between
+ 0-100 (inclusive) and preceded by '%', input
+ will be taken as % of actual height or width
+ of the page. Example: --area %0,0,100,50. To
+ specify multiple areas, -a option should be
+ repeated. Default is entire page
+ -b,--batch Convert all .pdfs in the provided directory.
-c,--columns X coordinates of column boundaries. Example
- --columns 10.1,20.2,30.3
- -d,--debug Print detected table areas instead of
- processing.
+ --columns 10.1,20.2,30.3. If all values are
+ between 0-100 (inclusive) and preceded by '%',
+ input will be taken as % of actual width of
+ the page. Example: --columns %25,50,80.6
-f,--format Output format: (CSV,TSV,JSON). Default: CSV
-g,--guess Guess the portion of the page to analyze per
page.
-h,--help Print this help text.
-i,--silent Suppress all stderr output.
- -n,--no-spreadsheet Force PDF not to be extracted using
- spreadsheet-style extraction (if there are
- ruling lines separating each cell, as in a PDF
- of an Excel spreadsheet)
+ -l,--lattice Force PDF to be extracted using lattice-mode
+ extraction (if there are ruling lines
+ separating each cell, as in a PDF of an Excel
+ spreadsheet)
+ -n,--no-spreadsheet [Deprecated in favor of -t/--stream] Force PDF
+ not to be extracted using spreadsheet-style
+ extraction (if there are no ruling lines
+ separating each cell)
-o,--outfile Write output to instead of STDOUT.
Default: -
-p,--pages Comma separated list of ranges, or all.
Examples: --pages 1-3,5-7, --pages 3 or
--pages all. Default is --pages 1
- -r,--spreadsheet Force PDF to be extracted using
- spreadsheet-style extraction (if there are
- ruling lines separating each cell, as in a PDF
- of an Excel spreadsheet)
+ -r,--spreadsheet [Deprecated in favor of -l/--lattice] Force
+ PDF to be extracted using spreadsheet-style
+ extraction (if there are ruling lines
+ separating each cell, as in a PDF of an Excel
+ spreadsheet)
-s,--password Password to decrypt document. Default is empty
+ -t,--stream Force PDF to be extracted using stream-mode
+ extraction (if there are no ruling lines
+ separating each cell)
-u,--use-line-returns Use embedded line returns in cells. (Only in
spreadsheet mode.)
-v,--version Print version and exit.
-
```
-It also includes a debugging tool, run `java -cp ./target/tabula-0.8.0-jar-with-dependencies.jar technology.tabula.debug.Debug -h` for the available options.
+It also includes a debugging tool, run `java -cp ./target/tabula-1.0.2-jar-with-dependencies.jar technology.tabula.debug.Debug -h` for the available options.
You can also integrate `tabula-java` with any JVM language. For Java examples, see the [`tests`](src/test/java/technology/tabula/) folder.
-© 2014 Manuel Aristarán. Available under MIT License. See [`LICENSE`](LICENSE).
+JVM start-up time accounts for much of the cost of running the `tabula` command, so if you're trying to extract many tables from PDFs, you have a few options for speeding it up:
+
+ - the [drip](https://github.com/ninjudd/drip) utility
+ - the [Ruby](http://github.com/tabulapdf/tabula-extractor), [Python](https://github.com/chezou/tabula-py), [R](https://github.com/leeper/tabulizer), and [Node.js](https://github.com/ezodude/tabula-js) bindings
+ - writing your own program in any JVM language (Java, JRuby, Scala) that imports tabula-java.
+ - waiting for us to implement an API/server-style system (it's on the [roadmap](https://github.com/tabulapdf/tabula-api))
+
+## Building from Source
+
+Clone this repo and run:
+
+```
+mvn clean compile assembly:single
+```
+
+## Contributing
+
+Interested in helping out? We'd love to have your help!
+
+You can help by:
+
+- [Reporting a bug](https://github.com/tabulapdf/tabula-java/issues).
+- Adding or editing documentation.
+- Contributing code via a Pull Request.
+- Spreading the word about `tabula-java` to people who might be able to benefit from using it.
+
+### Backers
+
+You can also support our continued work on `tabula-java` with a one-time or monthly donation [on OpenCollective](https://opencollective.com/tabulapdf#support). Organizations that use `tabula-java` can also [sponsor the project](https://opencollective.com/tabulapdf#support) for acknowledgement on [our official site](http://tabula.technology/) and this README.
+
+Special thanks to the following users and organizations for generously supporting Tabula with donations and grants:
+
+
+
+
+
+
+
+
+
+
diff --git a/appveyor.yml b/appveyor.yml
new file mode 100644
index 00000000..b2c4a0ae
--- /dev/null
+++ b/appveyor.yml
@@ -0,0 +1,21 @@
+version: '{build}'
+install:
+ - ps: |
+ Add-Type -AssemblyName System.IO.Compression.FileSystem
+ if (!(Test-Path -Path "C:\maven\apache-maven-3.5.4" )) {
+ (new-object System.Net.WebClient).DownloadFile(
+ 'http://www-us.apache.org/dist/maven/maven-3/3.5.4/binaries/apache-maven-3.5.4-bin.zip',
+ 'C:\maven-bin.zip'
+ )
+ [System.IO.Compression.ZipFile]::ExtractToDirectory("C:\maven-bin.zip", "C:\maven")
+ }
+ - cmd: SET PATH=C:\maven\apache-maven-3.5.4\bin;%JAVA_HOME%\bin;%PATH%
+ - cmd: SET MAVEN_OPTS=-Xmx2g
+ - cmd: SET JAVA_OPTS=-Xmx2g
+build_script:
+ - mvn clean package -B -DskipTests -Dmaven.javadoc.skip=true
+test_script:
+ - mvn install -B -Dmaven.javadoc.skip=true -Dgpg.skip
+cache:
+ - C:\maven -> appveyor.yml
+ - C:\Users\appveyor\.m2 -> appveyor.yml
diff --git a/pom.xml b/pom.xml
index e7ac6111..0f53c052 100644
--- a/pom.xml
+++ b/pom.xml
@@ -2,7 +2,7 @@
4.0.0
technology.tabula
tabula
- 0.9.0
+ 1.0.5-SNAPSHOT
Tabula
Extract tables from PDF files
http://github.com/tabulapdf/tabula-java
@@ -32,21 +32,26 @@
+
+
+ snapshots
+ https://repository.apache.org/content/repositories/snapshots/
+
+ false
+
+
+ true
+
+
+
+
scm:git:git@github.com:tabulapdf/tabula-java.git
scm:git:git@github.com:tabulapdf/tabula-java.git
git@github.com:tabulapdf/tabula-java.git
- tabula-0.9.0
+ v1.0.2
-
-
- sonatype
- Sonatype repository
- https://oss.sonatype.org/content/repositories/snapshots/
-
-
-
UTF-8
UTF-8
@@ -58,17 +63,16 @@
https://oss.sonatype.org/content/repositories/snapshots
- ossrh
- https://oss.sonatype.org/service/local/staging/deploy/maven2/
-
+ ossrh
+ https://oss.sonatype.org/service/local/staging/deploy/maven2/
+
-
org.apache.maven.plugins
maven-javadoc-plugin
- 2.10.3
+ 3.2.0
true
@@ -81,7 +85,7 @@
org.sonatype.plugins
nexus-staging-maven-plugin
- 1.6.3
+ 1.6.8
true
ossrh
@@ -93,7 +97,7 @@
org.apache.maven.plugins
maven-source-plugin
- 2.2.1
+ 3.2.1
attach-sources
@@ -103,23 +107,26 @@
-
- org.apache.maven.plugins
- maven-javadoc-plugin
- 2.9.1
-
-
- attach-javadocs
-
- jar
-
-
-
+
maven-compiler-plugin
- 3.1
+ 3.8.1
- 1.6
- 1.6
+ 1.8
+ 1.8
@@ -149,6 +162,25 @@
jar-with-dependencies
+
+
+
+ org.apache.maven.plugins
+ maven-surefire-plugin
+ 2.22.2
+
+
+ -Xms1024m -Xmx2048m
+
+
+
+
+ org.apache.maven.plugins
+ maven-eclipse-plugin
+ 2.10
+
+ true
+ true
@@ -158,11 +190,14 @@
release
-
+
org.apache.maven.plugins
maven-javadoc-plugin
- 2.9.1
+ 3.2.0
+
+ 8
+
attach-javadocs
@@ -175,7 +210,7 @@
org.apache.maven.plugins
maven-source-plugin
- 2.2.1
+ 3.2.1
attach-sources
@@ -188,7 +223,7 @@
org.apache.maven.plugins
maven-gpg-plugin
- 1.5
+ 1.6
sign-artifacts
@@ -206,64 +241,70 @@
- net.sf.jsi
- jsi
- 1.1.0-SNAPSHOT
+ org.locationtech.jts
+ jts-core
+ 1.17.0
org.slf4j
slf4j-api
- 1.7.20
+ 1.7.30
org.slf4j
slf4j-simple
- 1.7.20
+ 1.7.30
org.apache.pdfbox
pdfbox
- 1.8.10
-
-
-
- org.bouncycastle
- bcprov-jdk15
- 1.46
-
-
-
- org.bouncycastle
- bcmail-jdk15
- 1.46
+ 2.0.21
junit
junit
- 4.11
+ 4.13
test
commons-cli
commons-cli
- 1.3.1
+ 1.4
org.apache.commons
commons-csv
- 1.2
+ 1.8
com.google.code.gson
gson
- 2.6.2
+ 2.8.6
+
+
+
+ com.github.jai-imageio
+ jai-imageio-core
+ 1.4.0
+
+
+
+ com.github.jai-imageio
+ jai-imageio-jpeg2000
+ 1.3.0
+
+
+
+ org.apache.pdfbox
+ jbig2-imageio
+ 3.0.3
diff --git a/src/main/java/technology/tabula/Cell.java b/src/main/java/technology/tabula/Cell.java
index b7e568db..5757e729 100644
--- a/src/main/java/technology/tabula/Cell.java
+++ b/src/main/java/technology/tabula/Cell.java
@@ -7,69 +7,70 @@
@SuppressWarnings("serial")
public class Cell extends RectangularTextContainer {
- private boolean spanning;
- private boolean placeholder;
- private List textElements;
-
- public Cell(float top, float left, float width, float height) {
- super(top, left, width, height);
- this.setPlaceholder(false);
- this.setSpanning(false);
- this.setTextElements(new ArrayList());
- }
-
- public Cell(Point2D topLeft, Point2D bottomRight) {
- super((float) topLeft.getY(), (float) topLeft.getX(), (float) (bottomRight.getX() - topLeft.getX()), (float) (bottomRight.getY() - topLeft.getY()));
- this.setPlaceholder(false);
- this.setSpanning(false);
- this.setTextElements(new ArrayList());
- }
-
- @Override
- public String getText(boolean useLineReturns) {
- if (this.textElements.size() == 0) {
- return "";
- }
- StringBuilder sb = new StringBuilder();
- Collections.sort(this.textElements);
- double curTop = this.textElements.get(0).getTop();
- for (TextChunk tc: this.textElements) {
- if (useLineReturns && tc.getTop() > curTop) {
- sb.append('\r');
- }
- sb.append(tc.getText());
- curTop = tc.getTop();
- }
- return sb.toString().trim();
- }
+ public static char CELL_DIVIDER = '\r';
- public String getText() {
- return getText(true);
- }
+ public Cell(float top, float left, float width, float height) {
+ super(top, left, width, height);
+ this.setPlaceholder(false);
+ this.setSpanning(false);
+ this.setTextElements(new ArrayList());
+ }
- public boolean isSpanning() {
- return spanning;
- }
+ public Cell(Point2D topLeft, Point2D bottomRight) {
+ super((float) topLeft.getY(), (float) topLeft.getX(), (float) (bottomRight.getX() - topLeft.getX()), (float) (bottomRight.getY() - topLeft.getY()));
+ this.setPlaceholder(false);
+ this.setSpanning(false);
+ this.setTextElements(new ArrayList());
+ }
- public void setSpanning(boolean spanning) {
- this.spanning = spanning;
- }
+ private boolean spanning;
+ private boolean placeholder;
+ private List textElements;
- public boolean isPlaceholder() {
- return placeholder;
- }
+ @Override
+ public String getText(boolean useLineReturns) {
+ if (this.textElements.size() == 0) {
+ return "";
+ }
+ StringBuilder sb = new StringBuilder();
+ Collections.sort(this.textElements, Rectangle.ILL_DEFINED_ORDER);
+ double curTop = this.textElements.get(0).getTop();
+ for (TextChunk tc : this.textElements) {
+ if (useLineReturns && tc.getTop() > curTop) {
+ sb.append(CELL_DIVIDER);
+ }
+ sb.append(tc.getText());
+ curTop = tc.getTop();
+ }
+ return sb.toString().trim();
+ }
- public void setPlaceholder(boolean placeholder) {
- this.placeholder = placeholder;
- }
+ public String getText() {
+ return getText(true);
+ }
+ public boolean isSpanning() {
+ return spanning;
+ }
- public List getTextElements() {
- return textElements;
- }
+ public void setSpanning(boolean spanning) {
+ this.spanning = spanning;
+ }
- public void setTextElements(List textElements) {
- this.textElements = textElements;
- }
+ public boolean isPlaceholder() {
+ return placeholder;
+ }
+
+ public void setPlaceholder(boolean placeholder) {
+ this.placeholder = placeholder;
+ }
+
+ public List getTextElements() {
+ return textElements;
+ }
+
+ public void setTextElements(List textElements) {
+ this.textElements = textElements;
+ }
}
diff --git a/src/main/java/technology/tabula/CommandLineApp.java b/src/main/java/technology/tabula/CommandLineApp.java
index 7ec2d9d0..e940955d 100644
--- a/src/main/java/technology/tabula/CommandLineApp.java
+++ b/src/main/java/technology/tabula/CommandLineApp.java
@@ -1,26 +1,25 @@
package technology.tabula;
-import java.awt.geom.Point2D;
import java.io.BufferedWriter;
import java.io.File;
+import java.io.FilenameFilter;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
-import java.util.Iterator;
+import java.util.Collections;
import java.util.List;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.HelpFormatter;
-import org.apache.commons.cli.OptionBuilder;
+import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
-import org.apache.commons.cli.GnuParser;
+import org.apache.commons.cli.DefaultParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import technology.tabula.detectors.DetectionAlgorithm;
import technology.tabula.detectors.NurminenDetectionAlgorithm;
-import technology.tabula.detectors.SpreadsheetDetectionAlgorithm;
import technology.tabula.extractors.BasicExtractionAlgorithm;
import technology.tabula.extractors.SpreadsheetExtractionAlgorithm;
import technology.tabula.writers.CSVWriter;
@@ -31,17 +30,45 @@
public class CommandLineApp {
- private static String VERSION = "0.9.0";
- private static String VERSION_STRING = String.format("tabula %s (c) 2012-2016 Manuel Aristarán", VERSION);
+ private static String VERSION = "1.0.5";
+ private static String VERSION_STRING = String.format("tabula %s (c) 2012-2020 Manuel Aristarán", VERSION);
private static String BANNER = "\nTabula helps you extract tables from PDFs\n\n";
+ private static final int RELATIVE_AREA_CALCULATION_MODE = 0;
+ private static final int ABSOLUTE_AREA_CALCULATION_MODE = 1;
+
+
private Appendable defaultOutput;
+ private List> pageAreas;
+ private List pages;
+ private OutputFormat outputFormat;
+ private String password;
+ private TableExtractor tableExtractor;
+ private List verticalRulingPositions;
+
+    /** Captures the extraction settings from the parsed options; a malformed option value raises ParseException. */
+    public CommandLineApp(Appendable defaultOutput, CommandLine line) throws ParseException {
+        this.defaultOutput = defaultOutput;
+        this.pageAreas = CommandLineApp.whichAreas(line);
+        this.pages = CommandLineApp.whichPages(line);
+        this.outputFormat = CommandLineApp.whichOutputFormat(line);
+        this.tableExtractor = CommandLineApp.createExtractor(line);
+        if (line.hasOption('s')) {
+            this.password = line.getOptionValue('s');
+        }
+        // '%'-prefixed columns are page-relative and already handed to the extractor; parsing them here fails on the '%'.
+        if (line.hasOption('c') && !line.getOptionValue('c').startsWith("%")) {
+            this.verticalRulingPositions = parseFloatList(line.getOptionValue('c'));
+        }
+    }
+
public static void main(String[] args) {
- CommandLineParser parser = new GnuParser();
+ Cell.CELL_DIVIDER = ' '; // terminal does not like \r
+ CommandLineParser parser = new DefaultParser();
try {
// parse the command line arguments
- CommandLine line = parser.parse(buildOptions(), args );
+ CommandLine line = parser.parse(buildOptions(), args);
if (line.hasOption('h')) {
printHelp();
@@ -53,168 +80,217 @@ public static void main(String[] args) {
System.exit(0);
}
- if (line.getArgs().length != 1) {
- throw new ParseException("Need one filename\nTry --help for help");
- }
-
- new CommandLineApp(System.out).extractTables(line);
-
- }
- catch( ParseException exp ) {
+ new CommandLineApp(System.out, line).extractTables(line);
+ } catch (ParseException exp) {
System.err.println("Error: " + exp.getMessage());
System.exit(1);
}
System.exit(0);
}
- public CommandLineApp(Appendable defaultOutput) {
- this.defaultOutput = defaultOutput;
- }
-
public void extractTables(CommandLine line) throws ParseException {
- File pdfFile = new File(line.getArgs()[0]);
- if (!pdfFile.exists()) {
- throw new ParseException("File does not exist");
- }
-
- OutputFormat of = OutputFormat.CSV;
- if (line.hasOption('f')) {
- try {
- of = OutputFormat.valueOf(line.getOptionValue('f'));
- }
- catch (IllegalArgumentException e) {
- throw new ParseException(String.format(
- "format %s is illegal. Available formats: %s",
- line.getOptionValue('f'),
- Utils.join(",", OutputFormat.formatNames())));
+ if (line.hasOption('m')) {
+ if (line.getArgs().length != 0) {
+ throw new ParseException("Filename specified with batch\nTry --help for help");
}
+ File pdfDirectory = new File(line.getOptionValue('m'));
+ if (!pdfDirectory.isDirectory()) {
+ throw new ParseException("Directory does not exist or is not a directory");
+ }
+ extractDirectoryTables(line, pdfDirectory);
+ return;
}
- Appendable outFile = this.defaultOutput;
- if (line.hasOption('o')) {
- File file = new File(line.getOptionValue('o'));
+ if (line.getArgs().length != 1) {
+ throw new ParseException("Need exactly one filename\nTry --help for help");
+ }
- try {
- file.createNewFile();
- outFile = new BufferedWriter(new FileWriter(
- file.getAbsoluteFile()));
- } catch (IOException e) {
- throw new ParseException("Cannot create file "
- + line.getOptionValue('o'));
- }
+ File pdfFile = new File(line.getArgs()[0]);
+ if (!pdfFile.exists()) {
+ throw new ParseException("File does not exist");
}
+ extractFileTables(line, pdfFile);
+ }
- Rectangle area = null;
- if (line.hasOption('a')) {
- List f = parseFloatList(line.getOptionValue('a'));
- if (f.size() != 4) {
- throw new ParseException("area parameters must be top,left,bottom,right");
+ public void extractDirectoryTables(CommandLine line, File pdfDirectory) throws ParseException {
+ File[] pdfs = pdfDirectory.listFiles(new FilenameFilter() {
+ public boolean accept(File dir, String name) {
+ return name.endsWith(".pdf");
}
- area = new Rectangle(f.get(0), f.get(1), f.get(3) - f.get(1), f.get(2) - f.get(0));
+ });
+
+ for (File pdfFile : pdfs) {
+ File outputFile = new File(getOutputFilename(pdfFile));
+ extractFileInto(pdfFile, outputFile);
}
+ }
- List verticalRulingPositions = null;
- if (line.hasOption('c')) {
- verticalRulingPositions = parseFloatList(line.getOptionValue('c'));
+ public void extractFileTables(CommandLine line, File pdfFile) throws ParseException {
+ if (!line.hasOption('o')) {
+ extractFile(pdfFile, this.defaultOutput);
+ return;
}
- String pagesOption = line.hasOption('p') ? line.getOptionValue('p') : "1";
- List pages = Utils.parsePagesOption(pagesOption);
- ExtractionMethod method = whichExtractionMethod(line);
- boolean useLineReturns = line.hasOption('u');
+ File outputFile = new File(line.getOptionValue('o'));
+ extractFileInto(pdfFile, outputFile);
+ }
+ public void extractFileInto(File pdfFile, File outputFile) throws ParseException {
+ BufferedWriter bufferedWriter = null;
try {
+ FileWriter fileWriter = new FileWriter(outputFile.getAbsoluteFile());
+ bufferedWriter = new BufferedWriter(fileWriter);
- PDDocument pdfDocument = PDDocument.load(pdfFile);
-
- ObjectExtractor oe = line.hasOption('s') ?
- new ObjectExtractor(pdfDocument, line.getOptionValue('s')) :
- new ObjectExtractor(pdfDocument);
- BasicExtractionAlgorithm basicExtractor = new BasicExtractionAlgorithm();
- SpreadsheetExtractionAlgorithm spreadsheetExtractor = new SpreadsheetExtractionAlgorithm();
+ outputFile.createNewFile();
+ extractFile(pdfFile, bufferedWriter);
+ } catch (IOException e) {
+ throw new ParseException("Cannot create file " + outputFile);
+ } finally {
+ if (bufferedWriter != null) {
+ try {
+ bufferedWriter.close();
+ } catch (IOException e) {
+ System.out.println("Error in closing the BufferedWriter" + e);
+ }
+ }
+ }
+ }
- PageIterator pageIterator = pages == null ? oe.extract() : oe.extract(pages);
- Page page;
- List tables = new ArrayList();
+ private void extractFile(File pdfFile, Appendable outFile) throws ParseException {
+ PDDocument pdfDocument = null;
+ try {
+ pdfDocument = this.password == null ? PDDocument.load(pdfFile) : PDDocument.load(pdfFile, this.password);
+ PageIterator pageIterator = getPageIterator(pdfDocument);
+ List tables = new ArrayList<>();
while (pageIterator.hasNext()) {
- page = pageIterator.next();
-
- if (area != null) {
- page = page.getArea(area);
+ Page page = pageIterator.next();
+ if (verticalRulingPositions != null) {
+ for (Float verticalRulingPosition: verticalRulingPositions) {
+ page.addRuling(new Ruling(0, verticalRulingPosition, 0.0f, (float) page.getHeight()));
+ }
}
- if (method == ExtractionMethod.DECIDE) {
- method = spreadsheetExtractor.isTabular(page) ? ExtractionMethod.SPREADSHEET : ExtractionMethod.BASIC;
- }
-
- switch(method) {
- case BASIC:
- if (line.hasOption('g')) {
- // guess the page areas to extract using a detection algorithm
- // currently we only have a detector that uses spreadsheets to find table areas
- DetectionAlgorithm detector = new NurminenDetectionAlgorithm();
- List guesses = detector.detect(page);
-
- for (Rectangle guessRect : guesses) {
- Page guess = page.getArea(guessRect);
- tables.addAll(basicExtractor.extract(guess));
+ if (pageAreas != null) {
+ for (Pair areaPair : pageAreas) {
+ Rectangle area = areaPair.getRight();
+ if (areaPair.getLeft() == RELATIVE_AREA_CALCULATION_MODE) {
+ area = new Rectangle((float) (area.getTop() / 100 * page.getHeight()),
+ (float) (area.getLeft() / 100 * page.getWidth()), (float) (area.getWidth() / 100 * page.getWidth()),
+ (float) (area.getHeight() / 100 * page.getHeight()));
}
- } else {
- tables.addAll(verticalRulingPositions == null ? basicExtractor.extract(page) : basicExtractor.extract(page, verticalRulingPositions));
+ tables.addAll(tableExtractor.extractTables(page.getArea(area)));
}
-
- break;
- case SPREADSHEET:
- // TODO add useLineReturns
- tables.addAll(spreadsheetExtractor.extract(page));
- default:
- break;
+ } else {
+ tables.addAll(tableExtractor.extractTables(page));
}
}
- writeTables(of, tables, outFile);
-
-
+ writeTables(tables, outFile);
} catch (IOException e) {
throw new ParseException(e.getMessage());
+ } finally {
+ try {
+ if (pdfDocument != null) {
+ pdfDocument.close();
+ }
+ } catch (IOException e) {
+ System.out.println("Error in closing pdf document" + e);
+ }
}
+ }
+ private PageIterator getPageIterator(PDDocument pdfDocument) throws IOException {
+ ObjectExtractor extractor = new ObjectExtractor(pdfDocument);
+ return (pages == null) ?
+ extractor.extract() :
+ extractor.extract(pages);
}
- private void writeTables(OutputFormat format, List tables, Appendable out) throws IOException {
- Writer writer = null;
- switch (format) {
- case CSV:
- writer = new CSVWriter();
- break;
- case JSON:
- writer = new JSONWriter();
- break;
- case TSV:
- writer = new TSVWriter();
- break;
+ // CommandLine parsing methods
+
+ private static OutputFormat whichOutputFormat(CommandLine line) throws ParseException {
+ if (!line.hasOption('f')) {
+ return OutputFormat.CSV;
+ }
+
+ try {
+ return OutputFormat.valueOf(line.getOptionValue('f'));
+ } catch (IllegalArgumentException e) {
+ throw new ParseException(String.format(
+ "format %s is illegal. Available formats: %s",
+ line.getOptionValue('f'),
+ Utils.join(",", OutputFormat.formatNames())));
}
- writer.write(out, tables);
}
- private ExtractionMethod whichExtractionMethod(CommandLine line) {
- ExtractionMethod rv = ExtractionMethod.DECIDE;
- if (line.hasOption('r')) {
- rv = ExtractionMethod.SPREADSHEET;
+ private static List> whichAreas(CommandLine line) throws ParseException {
+ if (!line.hasOption('a')) {
+ return null;
}
- else if (line.hasOption('n') || line.hasOption('c') || line.hasOption('g')) {
- rv = ExtractionMethod.BASIC;
+
+ String[] optionValues = line.getOptionValues('a');
+
+ List> areaList = new ArrayList>();
+ for (String optionValue: optionValues) {
+ int areaCalculationMode = ABSOLUTE_AREA_CALCULATION_MODE;
+ int startIndex = 0;
+ if (optionValue.startsWith("%")) {
+ startIndex = 1;
+ areaCalculationMode = RELATIVE_AREA_CALCULATION_MODE;
+ }
+ List f = parseFloatList(optionValue.substring(startIndex));
+ if (f.size() != 4) {
+ throw new ParseException("area parameters must be top,left,bottom,right optionally preceded by %");
+ }
+ areaList.add(new Pair(areaCalculationMode, new Rectangle(f.get(0), f.get(1), f.get(3) - f.get(1), f.get(2) - f.get(0))));
}
- return rv;
+ return areaList;
}
+    private static List whichPages(CommandLine line) throws ParseException {
+        // Without -p/--pages only the first page is selected.
+        return Utils.parsePagesOption(line.hasOption('p') ? line.getOptionValue('p') : "1");
+    }
+    /** Maps the mode flags to an extraction method: lattice flags win over stream flags, DECIDE otherwise. */
+    private static ExtractionMethod whichExtractionMethod(CommandLine line) {
+        // -r/--spreadsheet [deprecated; use -l] or -l/--lattice
+        if (line.hasOption('r') || line.hasOption('l')) {
+            return ExtractionMethod.SPREADSHEET;
+        }
+        // -n/--no-spreadsheet [deprecated; use -t] or -c/--columns or -t/--stream (-g/--guess alone selects nothing here)
+        if (line.hasOption('n') || line.hasOption('c') || line.hasOption('t')) {
+            return ExtractionMethod.BASIC;
+        }
+        return ExtractionMethod.DECIDE;
+    }
+
+    /** Creates the TableExtractor and applies the guess, blunt-guess, mode, line-return and column options. */
+    private static TableExtractor createExtractor(CommandLine line) throws ParseException {
+        TableExtractor configured = new TableExtractor();
+        configured.setGuess(line.hasOption('g'));
+        configured.setBluntGuess(line.hasOption('b'));
+        configured.setMethod(CommandLineApp.whichExtractionMethod(line));
+        configured.setUseLineReturns(line.hasOption('u'));
+        if (!line.hasOption('c')) {
+            return configured;
+        }
+        String columns = line.getOptionValue('c');
+        boolean relative = columns.startsWith("%"); // a leading '%' marks the positions as relative
+        if (relative) {
+            configured.setVerticalRulingPositionsRelative(true);
+        }
+        configured.setVerticalRulingPositions(parseFloatList(relative ? columns.substring(1) : columns));
+        return configured;
+    }
+
+ // utilities, etc.
public static List parseFloatList(String option) throws ParseException {
String[] f = option.split(",");
- List rv = new ArrayList();
+ List rv = new ArrayList<>();
try {
for (int i = 0; i < f.length; i++) {
rv.add(Float.parseFloat(f[i]));
@@ -230,53 +306,219 @@ private static void printHelp() {
formatter.printHelp("tabula", BANNER, buildOptions(), "", true);
}
- @SuppressWarnings("static-access")
public static Options buildOptions() {
Options o = new Options();
o.addOption("v", "version", false, "Print version and exit.");
o.addOption("h", "help", false, "Print this help text.");
o.addOption("g", "guess", false, "Guess the portion of the page to analyze per page.");
- o.addOption("d", "debug", false, "Print detected table areas instead of processing");
- o.addOption("r", "spreadsheet", false, "Force PDF to be extracted using spreadsheet-style extraction (if there are ruling lines separating each cell, as in a PDF of an Excel spreadsheet)");
- o.addOption("n", "no-spreadsheet", false, "Force PDF not to be extracted using spreadsheet-style extraction (if there are ruling lines separating each cell, as in a PDF of an Excel spreadsheet)");
+ o.addOption("b", "blunt", false, "Guess the portion of the page to analyze per page using blunt detection");
+ o.addOption("r", "spreadsheet", false, "[Deprecated in favor of -l/--lattice] Force PDF to be extracted using spreadsheet-style extraction (if there are ruling lines separating each cell, as in a PDF of an Excel spreadsheet)");
+ o.addOption("n", "no-spreadsheet", false, "[Deprecated in favor of -t/--stream] Force PDF not to be extracted using spreadsheet-style extraction (if there are no ruling lines separating each cell)");
+ o.addOption("l", "lattice", false, "Force PDF to be extracted using lattice-mode extraction (if there are ruling lines separating each cell, as in a PDF of an Excel spreadsheet)");
+ o.addOption("t", "stream", false, "Force PDF to be extracted using stream-mode extraction (if there are no ruling lines separating each cell)");
o.addOption("i", "silent", false, "Suppress all stderr output.");
o.addOption("u", "use-line-returns", false, "Use embedded line returns in cells. (Only in spreadsheet mode.)");
- o.addOption("d", "debug", false, "Print detected table areas instead of processing.");
- o.addOption(OptionBuilder.withLongOpt("outfile")
- .withDescription("Write output to instead of STDOUT. Default: -")
- .hasArg()
- .withArgName("OUTFILE")
- .create("o"));
- o.addOption(OptionBuilder.withLongOpt("format")
- .withDescription("Output format: (" + Utils.join(",", OutputFormat.formatNames()) + "). Default: CSV")
- .hasArg()
- .withArgName("FORMAT")
- .create("f"));
- o.addOption(OptionBuilder.withLongOpt("password")
- .withDescription("Password to decrypt document. Default is empty")
- .hasArg()
- .withArgName("PASSWORD")
- .create("s"));
- o.addOption(OptionBuilder.withLongOpt("columns")
- .withDescription("X coordinates of column boundaries. Example --columns 10.1,20.2,30.3")
- .hasArg()
- .withArgName("COLUMNS")
- .create("c"));
- o.addOption(OptionBuilder.withLongOpt("area")
- .withDescription("Portion of the page to analyze (top,left,bottom,right). Example: --area 269.875,12.75,790.5,561. Default is entire page")
- .hasArg()
- .withArgName("AREA")
- .create("a"));
- o.addOption(OptionBuilder.withLongOpt("pages")
- .withDescription("Comma separated list of ranges, or all. Examples: --pages 1-3,5-7, --pages 3 or --pages all. Default is --pages 1")
- .hasArg()
- .withArgName("PAGES")
- .create("p"));
+ // o.addOption("d", "debug", false, "Print detected table areas instead of processing.");
+ o.addOption(Option.builder("m")
+ .longOpt("multiple")
+ .desc("Convert all .pdfs in the provided directory.")
+ .hasArg()
+ .argName("DIRECTORY")
+ .build());
+ o.addOption(Option.builder("o")
+ .longOpt("outfile")
+ .desc("Write output to instead of STDOUT. Default: -")
+ .hasArg()
+ .argName("OUTFILE")
+ .build());
+ o.addOption(Option.builder("f")
+ .longOpt("format")
+ .desc("Output format: (" + Utils.join(",", OutputFormat.formatNames()) + "). Default: CSV")
+ .hasArg()
+ .argName("FORMAT")
+ .build());
+ o.addOption(Option.builder("s")
+ .longOpt("password")
+ .desc("Password to decrypt document. Default is empty")
+ .hasArg()
+ .argName("PASSWORD")
+ .build());
+ o.addOption(Option.builder("c")
+ .longOpt("columns")
+ .desc("X coordinates of column boundaries. Example --columns 10.1,20.2,30.3. "
+ + "If all values are between 0-100 (inclusive) and preceded by '%', input will be taken as % of actual width of the page. "
+ + "Example: --columns %25,50,80.6")
+ .hasArg()
+ .argName("COLUMNS")
+ .build());
+ o.addOption(Option.builder("a")
+ .longOpt("area")
+ .desc("-a/--area = Portion of the page to analyze. Example: --area 269.875,12.75,790.5,561. "
+ + "Accepts top,left,bottom,right i.e. y1,x1,y2,x2 where all values are in points relative to the top left corner. "
+ + "If all values are between 0-100 (inclusive) and preceded by '%', input will be taken as % of actual height or width of the page. "
+ + "Example: --area %0,0,100,50. To specify multiple areas, -a option should be repeated. Default is entire page")
+ .hasArg()
+ .argName("AREA")
+ .build());
+ o.addOption(Option.builder("p")
+ .longOpt("pages")
+ .desc("Comma separated list of ranges, or all. Examples: --pages 1-3,5-7, --pages 3 or --pages all. Default is --pages 1")
+ .hasArg()
+ .argName("PAGES")
+ .build());
return o;
}
+ private static class TableExtractor {
+ private boolean bluntGuess = false;
+ private boolean guess = false;
+ private boolean useLineReturns = false;
+ private BasicExtractionAlgorithm basicExtractor = new BasicExtractionAlgorithm();
+ private SpreadsheetExtractionAlgorithm spreadsheetExtractor = new SpreadsheetExtractionAlgorithm();
+
+ private boolean verticalRulingPositionsRelative = false;
+ private List verticalRulingPositions = null;
+
+ private ExtractionMethod method = ExtractionMethod.BASIC;
+
+ public TableExtractor() {
+ }
+
+ public void setVerticalRulingPositions(List positions) {
+ this.verticalRulingPositions = positions;
+ }
+ public void setVerticalRulingPositionsRelative(boolean relative) {
+ this.verticalRulingPositionsRelative = relative;
+ }
+
+ public void setGuess(boolean guess) {
+ this.guess = guess;
+ }
+
+ public void setBluntGuess(boolean bluntGuess) {
+ this.bluntGuess = bluntGuess;
+ }
+
+ public void setUseLineReturns(boolean useLineReturns) {
+ this.useLineReturns = useLineReturns;
+ }
+
+ public void setMethod(ExtractionMethod method) {
+ this.method = method;
+ }
+
+ public List extractTables(Page page) {
+ ExtractionMethod effectiveMethod = this.method;
+ if (effectiveMethod == ExtractionMethod.DECIDE) {
+ effectiveMethod = spreadsheetExtractor.isTabular(page) ?
+ ExtractionMethod.SPREADSHEET :
+ ExtractionMethod.BASIC;
+ }
+ switch (effectiveMethod) {
+ case BASIC:
+ return extractTablesBasic(page);
+ case SPREADSHEET:
+ return extractTablesSpreadsheet(page);
+ default:
+ return new ArrayList<>();
+ }
+ }
+
+ public List extractTablesBasic(Page page) {
+ basicExtractor.setMixedTableExtractionEnabled(guess || bluntGuess);
+ if (guess || bluntGuess) {
+ // guess the page areas to extract using a detection algorithm
+ // currently we only have a detector that uses spreadsheets to find table areas
+ NurminenDetectionAlgorithm detector = new NurminenDetectionAlgorithm();
+ List guesses = detector.detect(page);
+ if (bluntGuess) {
+ Rectangle guess = detector.bluntDetect();
+ guesses = (guess == null) ? new ArrayList<>() : Collections.singletonList(guess);
+ }
+ List tables = new ArrayList<>();
+
+ for (Rectangle guessRect : guesses) {
+ Page guess = page.getArea(guessRect);
+ tables.addAll(basicExtractor.extract(guess));
+ }
+ return tables;
+ }
+
+ if (verticalRulingPositions != null) {
+ List absoluteRulingPositions;
+
+ if (this.verticalRulingPositionsRelative) {
+ // convert relative to absolute
+ absoluteRulingPositions = new ArrayList<>(verticalRulingPositions.size());
+ for (float relative: this.verticalRulingPositions) {
+ float absolute = (float)(relative / 100.0 * page.getWidth());
+ absoluteRulingPositions.add(absolute);
+ }
+ } else {
+ absoluteRulingPositions = this.verticalRulingPositions;
+ }
+ return basicExtractor.extract(page, absoluteRulingPositions);
+ }
+
+ return basicExtractor.extract(page);
+ }
+
+ public List extractTablesSpreadsheet(Page page) {
+ // TODO add useLineReturns
+ List tables = new ArrayList<>();
+
+ if (guess || bluntGuess) {
+ NurminenDetectionAlgorithm detector = new NurminenDetectionAlgorithm();
+ List guesses = detector.detect(page);
+ if (bluntGuess) {
+ Rectangle guess = detector.bluntDetect();
+ guesses = (guess == null) ? new ArrayList<>() : Collections.singletonList(guess);
+ }
+ for (Rectangle guessRect : guesses) {
+ Page guess = page.getArea(guessRect);
+ tables.addAll(spreadsheetExtractor.extract(guess));
+ }
+ return tables;
+ } else {
+ return spreadsheetExtractor.extract(page);
+ }
+ }
+ }
+
+ private void writeTables(List tables, Appendable out) throws IOException {
+ Writer writer = null;
+ switch (outputFormat) {
+ case CSV:
+ writer = new CSVWriter();
+ break;
+ case JSON:
+ writer = new JSONWriter();
+ break;
+ case TSV:
+ writer = new TSVWriter();
+ break;
+ }
+ writer.write(out, tables);
+ }
+
+ private String getOutputFilename(File pdfFile) {
+ String extension = ".csv";
+ switch (outputFormat) {
+ case CSV:
+ extension = ".csv";
+ break;
+ case JSON:
+ extension = ".json";
+ break;
+ case TSV:
+ extension = ".tsv";
+ break;
+ }
+ return pdfFile.getPath().replaceFirst("(\\.pdf|)$", extension);
+ }
+
private enum OutputFormat {
CSV,
TSV,
@@ -290,7 +532,6 @@ static String[] formatNames() {
}
return rv;
}
-
}
private enum ExtractionMethod {
diff --git a/src/main/java/technology/tabula/DummyGraphics2D.java b/src/main/java/technology/tabula/DummyGraphics2D.java
deleted file mode 100644
index 88026fec..00000000
--- a/src/main/java/technology/tabula/DummyGraphics2D.java
+++ /dev/null
@@ -1,461 +0,0 @@
-package technology.tabula;
-
-import java.awt.Color;
-import java.awt.Composite;
-import java.awt.Font;
-import java.awt.FontMetrics;
-import java.awt.Graphics;
-import java.awt.Graphics2D;
-import java.awt.GraphicsConfiguration;
-import java.awt.Image;
-import java.awt.Paint;
-import java.awt.Rectangle;
-import java.awt.RenderingHints;
-import java.awt.RenderingHints.Key;
-import java.awt.Shape;
-import java.awt.Stroke;
-import java.awt.font.FontRenderContext;
-import java.awt.font.GlyphVector;
-import java.awt.geom.AffineTransform;
-import java.awt.image.BufferedImage;
-import java.awt.image.BufferedImageOp;
-import java.awt.image.ImageObserver;
-import java.awt.image.RenderedImage;
-import java.awt.image.renderable.RenderableImage;
-import java.text.AttributedCharacterIterator;
-import java.util.Map;
-
-public class DummyGraphics2D extends Graphics2D {
-
- @Override
- public void addRenderingHints(Map, ?> hints) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void clip(Shape s) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void draw(Shape s) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void drawGlyphVector(GlyphVector g, float x, float y) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public boolean drawImage(Image img, AffineTransform xform, ImageObserver obs) {
- // TODO Auto-generated method stub
- return false;
- }
-
- @Override
- public void drawImage(BufferedImage img, BufferedImageOp op, int x, int y) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void drawRenderableImage(RenderableImage img, AffineTransform xform) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void drawRenderedImage(RenderedImage img, AffineTransform xform) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void drawString(String str, int x, int y) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void drawString(String str, float x, float y) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void drawString(AttributedCharacterIterator iterator, int x, int y) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void drawString(AttributedCharacterIterator iterator, float x,
- float y) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void fill(Shape s) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public Color getBackground() {
- // TODO Auto-generated method stub
- return null;
- }
-
- @Override
- public Composite getComposite() {
- // TODO Auto-generated method stub
- return null;
- }
-
- @Override
- public GraphicsConfiguration getDeviceConfiguration() {
- // TODO Auto-generated method stub
- return null;
- }
-
- @Override
- public FontRenderContext getFontRenderContext() {
- // TODO Auto-generated method stub
- return null;
- }
-
- @Override
- public Paint getPaint() {
- // TODO Auto-generated method stub
- return null;
- }
-
- @Override
- public Object getRenderingHint(Key hintKey) {
- // TODO Auto-generated method stub
- return null;
- }
-
- @Override
- public RenderingHints getRenderingHints() {
- // TODO Auto-generated method stub
- return null;
- }
-
- @Override
- public Stroke getStroke() {
- // TODO Auto-generated method stub
- return null;
- }
-
- @Override
- public AffineTransform getTransform() {
- // TODO Auto-generated method stub
- return null;
- }
-
- @Override
- public boolean hit(Rectangle rect, Shape s, boolean onStroke) {
- // TODO Auto-generated method stub
- return false;
- }
-
- @Override
- public void rotate(double theta) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void rotate(double theta, double x, double y) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void scale(double sx, double sy) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void setBackground(Color color) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void setComposite(Composite comp) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void setPaint(Paint paint) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void setRenderingHint(Key hintKey, Object hintValue) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void setRenderingHints(Map, ?> hints) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void setStroke(Stroke s) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void setTransform(AffineTransform Tx) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void shear(double shx, double shy) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void transform(AffineTransform Tx) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void translate(int x, int y) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void translate(double tx, double ty) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void clearRect(int x, int y, int width, int height) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void clipRect(int x, int y, int width, int height) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void copyArea(int x, int y, int width, int height, int dx, int dy) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public Graphics create() {
- // TODO Auto-generated method stub
- return null;
- }
-
- @Override
- public void dispose() {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void drawArc(int x, int y, int width, int height, int startAngle,
- int arcAngle) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public boolean drawImage(Image img, int x, int y, ImageObserver observer) {
- // TODO Auto-generated method stub
- return false;
- }
-
- @Override
- public boolean drawImage(Image img, int x, int y, Color bgcolor,
- ImageObserver observer) {
- // TODO Auto-generated method stub
- return false;
- }
-
- @Override
- public boolean drawImage(Image img, int x, int y, int width, int height,
- ImageObserver observer) {
- // TODO Auto-generated method stub
- return false;
- }
-
- @Override
- public boolean drawImage(Image img, int x, int y, int width, int height,
- Color bgcolor, ImageObserver observer) {
- // TODO Auto-generated method stub
- return false;
- }
-
- @Override
- public boolean drawImage(Image img, int dx1, int dy1, int dx2, int dy2,
- int sx1, int sy1, int sx2, int sy2, ImageObserver observer) {
- // TODO Auto-generated method stub
- return false;
- }
-
- @Override
- public boolean drawImage(Image img, int dx1, int dy1, int dx2, int dy2,
- int sx1, int sy1, int sx2, int sy2, Color bgcolor,
- ImageObserver observer) {
- // TODO Auto-generated method stub
- return false;
- }
-
- @Override
- public void drawLine(int x1, int y1, int x2, int y2) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void drawOval(int x, int y, int width, int height) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void drawPolygon(int[] xPoints, int[] yPoints, int nPoints) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void drawPolyline(int[] xPoints, int[] yPoints, int nPoints) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void drawRoundRect(int x, int y, int width, int height,
- int arcWidth, int arcHeight) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void fillArc(int x, int y, int width, int height, int startAngle,
- int arcAngle) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void fillOval(int x, int y, int width, int height) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void fillPolygon(int[] xPoints, int[] yPoints, int nPoints) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void fillRect(int x, int y, int width, int height) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void fillRoundRect(int x, int y, int width, int height,
- int arcWidth, int arcHeight) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public Shape getClip() {
- // TODO Auto-generated method stub
- return null;
- }
-
- @Override
- public Rectangle getClipBounds() {
- // TODO Auto-generated method stub
- return null;
- }
-
- @Override
- public Color getColor() {
- // TODO Auto-generated method stub
- return null;
- }
-
- @Override
- public Font getFont() {
- // TODO Auto-generated method stub
- return null;
- }
-
- @Override
- public FontMetrics getFontMetrics(Font f) {
- // TODO Auto-generated method stub
- return null;
- }
-
- @Override
- public void setClip(Shape clip) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void setClip(int x, int y, int width, int height) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void setColor(Color c) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void setFont(Font font) {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void setPaintMode() {
- // TODO Auto-generated method stub
-
- }
-
- @Override
- public void setXORMode(Color c1) {
- // TODO Auto-generated method stub
-
- }
-
-}
diff --git a/src/main/java/technology/tabula/HasText.java b/src/main/java/technology/tabula/HasText.java
index 6f375dbc..99455afb 100644
--- a/src/main/java/technology/tabula/HasText.java
+++ b/src/main/java/technology/tabula/HasText.java
@@ -1,7 +1,7 @@
package technology.tabula;
public interface HasText {
-
- String getText();
+
+ String getText();
}
diff --git a/src/main/java/technology/tabula/Line.java b/src/main/java/technology/tabula/Line.java
index ed2f6895..31d10529 100644
--- a/src/main/java/technology/tabula/Line.java
+++ b/src/main/java/technology/tabula/Line.java
@@ -8,7 +8,7 @@
@SuppressWarnings("serial")
public class Line extends Rectangle {
- List textChunks = new ArrayList();
+ List textChunks = new ArrayList<>();
public static final Character[] WHITE_SPACE_CHARS = { ' ', '\t', '\r', '\n', '\f' };
@@ -52,7 +52,7 @@ public void addTextChunk(TextChunk textChunk) {
public String toString() {
StringBuilder sb = new StringBuilder();
String s = super.toString();
- sb.append(s.substring(0, s.length() - 1));
+ sb.append(s, 0, s.length() - 1);
sb.append(",chunks=");
for (TextChunk te: this.textChunks) {
sb.append("'" + te.getText() + "', ");
diff --git a/src/main/java/technology/tabula/ObjectExtractor.java b/src/main/java/technology/tabula/ObjectExtractor.java
index b04927fb..3998ba6f 100644
--- a/src/main/java/technology/tabula/ObjectExtractor.java
+++ b/src/main/java/technology/tabula/ObjectExtractor.java
@@ -1,132 +1,49 @@
package technology.tabula;
-import java.awt.Image;
-import java.awt.Shape;
-import java.awt.event.KeyEvent;
-import java.awt.geom.AffineTransform;
-import java.awt.geom.GeneralPath;
-import java.awt.geom.Line2D;
-import java.awt.geom.PathIterator;
-import java.awt.geom.Point2D;
-import java.awt.geom.Rectangle2D;
import java.io.IOException;
-import java.lang.reflect.Field;
-import java.util.ArrayList;
-import java.util.Comparator;
-import java.util.List;
-import org.apache.pdfbox.exceptions.CryptographyException;
-import org.apache.pdfbox.pdfviewer.PageDrawer;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
-import org.apache.pdfbox.pdmodel.common.PDRectangle;
-import org.apache.pdfbox.pdmodel.common.PDStream;
-import org.apache.pdfbox.pdmodel.encryption.BadSecurityHandlerException;
-import org.apache.pdfbox.pdmodel.encryption.StandardDecryptionMaterial;
-import org.apache.pdfbox.pdmodel.font.PDFont;
-import org.apache.pdfbox.pdmodel.font.PDType3Font;
-import org.apache.pdfbox.pdmodel.graphics.PDGraphicsState;
-import org.apache.pdfbox.pdmodel.text.PDTextState;
-import org.apache.pdfbox.util.TextPosition;
-public class ObjectExtractor extends org.apache.pdfbox.pdfviewer.PageDrawer {
+public class ObjectExtractor {
- private static final char[] spaceLikeChars = { ' ', '-', '1', 'i' };
- private static final String NBSP = "\u00A0";
+ private final PDDocument pdfDocument;
- private float minCharWidth;
- private float minCharHeight;
- private List characters;
- private List rulings;
- private RectangleSpatialIndex spatialIndex;
- private AffineTransform pageTransform;
- public List clippingPaths;
- private boolean debugClippingPaths;
- private boolean extractRulingLines;
- private final PDDocument pdf_document;
- protected List pdf_document_pages;
-
-
- public ObjectExtractor(PDDocument pdf_document) throws IOException {
- this(pdf_document, null, true, false);
+ public ObjectExtractor(PDDocument pdfDocument) {
+ this.pdfDocument = pdfDocument;
}
- public ObjectExtractor(PDDocument pdf_document, boolean debugClippingPaths) throws IOException {
- this(pdf_document, null, true, debugClippingPaths);
- }
-
- public ObjectExtractor(PDDocument pdf_document, String password) throws IOException {
- this(pdf_document, password, true, false);
- }
+ protected Page extractPage(Integer pageNumber) throws IOException {
- public ObjectExtractor(PDDocument pdf_document, String password, boolean extractRulingLines, boolean debugClippingPaths)
- throws IOException {
- super();
-
- this.clippingPaths = new ArrayList();
- this.debugClippingPaths = debugClippingPaths;
- this.extractRulingLines = extractRulingLines;
-
- this.initialize();
-
- // patch PageDrawer: dummy Graphics2D context so some drawing operators don't complain
- try {
- Field field = PageDrawer.class.getDeclaredField("graphics");
- field.setAccessible(true);
- field.set(this, new DummyGraphics2D());
- }
- catch (Exception e1) {
- }
-
- if (pdf_document.isEncrypted()) {
- try {
- pdf_document
- .openProtection(new StandardDecryptionMaterial(password));
- } catch (BadSecurityHandlerException e) {
- // TODO Auto-generated catch block
- throw new IOException("BadSecurityHandler");
- } catch (CryptographyException e) {
- throw new IOException("Document is encrypted");
- }
+ if (pageNumber > this.pdfDocument.getNumberOfPages() || pageNumber < 1) {
+ throw new java.lang.IndexOutOfBoundsException(
+ "Page number does not exist");
}
- this.pdf_document = pdf_document;
- this.pdf_document_pages = this.pdf_document.getDocumentCatalog()
- .getAllPages();
- }
+ PDPage p = this.pdfDocument.getPage(pageNumber - 1);
+ ObjectExtractorStreamEngine se = new ObjectExtractorStreamEngine(p);
+ se.processPage(p);
- protected Page extractPage(Integer page_number) throws IOException {
- if (page_number > this.pdf_document_pages.size() || page_number < 1) {
- throw new java.lang.IndexOutOfBoundsException(
- "Page number does not exist");
- }
- this.initialize();
+ TextStripper pdfTextStripper = new TextStripper(this.pdfDocument, pageNumber);
+
+ pdfTextStripper.process();
- PDPage pdPage = (PDPage) this.pdf_document_pages.get(page_number - 1);
- pdPage = this.drawPage(pdPage);
-
- if(pdPage != null) {
-
- Utils.sort(this.characters);
-
- float w, h;
- int pageRotation = pdPage.findRotation();
- if (Math.abs(pageRotation) == 90 || Math.abs(pageRotation) == 270) {
- w = pdPage.findCropBox().getHeight();
- h = pdPage.findCropBox().getWidth();
- }
- else {
- w = pdPage.findCropBox().getWidth();
- h = pdPage.findCropBox().getHeight();
- }
-
- return new Page(0, 0, w, h, pageRotation, page_number, pdPage, this.characters,
- this.rulings, this.minCharWidth, this.minCharHeight,
- this.spatialIndex);
+ Utils.sort(pdfTextStripper.textElements, Rectangle.ILL_DEFINED_ORDER);
+
+ float w, h;
+ int pageRotation = p.getRotation();
+ if (Math.abs(pageRotation) == 90 || Math.abs(pageRotation) == 270) {
+ w = p.getCropBox().getHeight();
+ h = p.getCropBox().getWidth();
+ } else {
+ w = p.getCropBox().getWidth();
+ h = p.getCropBox().getHeight();
}
- return null;//TODO: content is empty, return null? or empty Page? or exception?
+
+ return new Page(0, 0, w, h, pageRotation, pageNumber, p, this.pdfDocument, pdfTextStripper.textElements,
+ se.rulings, pdfTextStripper.minCharWidth, pdfTextStripper.minCharHeight, pdfTextStripper.spatialIndex);
}
public PageIterator extract(Iterable pages) {
@@ -134,7 +51,7 @@ public PageIterator extract(Iterable pages) {
}
public PageIterator extract() {
- return extract(Utils.range(1, this.pdf_document_pages.size() + 1));
+ return extract(Utils.range(1, this.pdfDocument.getNumberOfPages() + 1));
}
public Page extract(int pageNumber) {
@@ -142,286 +59,9 @@ public Page extract(int pageNumber) {
}
public void close() throws IOException {
- this.pdf_document.close();
- }
-
- private PDPage drawPage(PDPage p) throws IOException {
- this.page = p;
- PDStream contents = p.getContents();
- if (contents != null) {
- ensurePageSize();
- this.processStream(p, p.findResources(), contents.getStream());
- return p;
- }
- return null;
- }
-
- private void ensurePageSize() {
- if (this.pageSize == null && this.page != null) {
- PDRectangle cropBox = this.page.findCropBox();
- this.pageSize = cropBox == null ? null : cropBox
- .createDimension();
- }
- }
-
- private void initialize() {
- this.characters = new ArrayList();
- this.rulings = new ArrayList();
- this.pageTransform = null;
- this.spatialIndex = new RectangleSpatialIndex();
- this.minCharWidth = Float.MAX_VALUE;
- this.minCharHeight = Float.MAX_VALUE;
- }
-
- @Override
- public void drawImage(Image awtImage, AffineTransform at) {
- // we just ignore images (for now)
- }
-
- public void strokeOrFillPath(boolean isFill) {
- GeneralPath path = this.getLinePath();
-
- if (!this.extractRulingLines) {
- this.getLinePath().reset();
- return;
- }
-
- PathIterator pi = path.getPathIterator(this.getPageTransform());
- float[] c = new float[6];
- int currentSegment;
-
- // skip paths whose first operation is not a MOVETO
- // or contains operations other than LINETO, MOVETO or CLOSE
- if ((pi.currentSegment(c) != PathIterator.SEG_MOVETO)) {
- path.reset();
- return;
- }
- pi.next();
- while (!pi.isDone()) {
- currentSegment = pi.currentSegment(c);
- if (currentSegment != PathIterator.SEG_LINETO
- && currentSegment != PathIterator.SEG_CLOSE
- && currentSegment != PathIterator.SEG_MOVETO) {
- path.reset();
- return;
- }
- pi.next();
- }
-
- // TODO: how to implement color filter?
-
- // skip the first path operation and save it as the starting position
- float[] first = new float[6];
- pi = path.getPathIterator(this.getPageTransform());
- pi.currentSegment(first);
- // last move
- Point2D.Float start_pos = new Point2D.Float(Utils.round(first[0], 2), Utils.round(first[1], 2));
- Point2D.Float last_move = start_pos;
- Point2D.Float end_pos = null;
- Line2D.Float line;
- PointComparator pc = new PointComparator();
-
- while (!pi.isDone()) {
- pi.next();
- currentSegment = pi.currentSegment(c);
- switch (currentSegment) {
- case PathIterator.SEG_LINETO:
- end_pos = new Point2D.Float(c[0], c[1]);
-
- line = pc.compare(start_pos, end_pos) == -1 ? new Line2D.Float(
- start_pos, end_pos) : new Line2D.Float(end_pos,
- start_pos);
-
- if (line.intersects(this.currentClippingPath())) {
- Ruling r = new Ruling(line.getP1(), line.getP2())
- .intersect(this.currentClippingPath());
-
- if (r.length() > 0.01) {
- this.rulings.add(r);
- }
- }
- break;
- case PathIterator.SEG_MOVETO:
- last_move = new Point2D.Float(c[0], c[1]);
- end_pos = last_move;
- break;
- case PathIterator.SEG_CLOSE:
- // according to PathIterator docs:
- // "the preceding subpath should be closed by appending a line
- // segment
- // back to the point corresponding to the most recent
- // SEG_MOVETO."
- line = pc.compare(end_pos, last_move) == -1 ? new Line2D.Float(
- end_pos, last_move) : new Line2D.Float(last_move,
- end_pos);
-
- if (line.intersects(this.currentClippingPath())) {
- Ruling r = new Ruling(line.getP1(), line.getP2())
- .intersect(this.currentClippingPath());
-
- if (r.length() > 0.01) {
- this.rulings.add(r);
- }
- }
- break;
- }
- start_pos = end_pos;
- }
- path.reset();
- }
-
- @Override
- public void strokePath() throws IOException {
- this.strokeOrFillPath(false);
- }
-
- @Override
- public void fillPath(int windingRule) throws IOException {
- //
- // float[] color_comps =
- // this.getGraphicsState().getNonStrokingColor().getJavaColor().getRGBColorComponents(null);
- float[] color = this.getGraphicsState().getNonStrokingColor().getJavaColor().getComponents(null);
- // TODO use color_comps as filter_by_color
- this.strokeOrFillPath(true);
+ this.pdfDocument.close();
}
- private float currentSpaceWidth() {
- PDGraphicsState gs = this.getGraphicsState();
- PDTextState ts = gs.getTextState();
- PDFont font = ts.getFont();
- float fontSizeText = ts.getFontSize();
- float horizontalScalingText = ts.getHorizontalScalingPercent() / 100.0f;
- float spaceWidthText = 1000;
- if (font instanceof PDType3Font) {
- // TODO WHAT?
- }
-
- for (int i = 0; i < spaceLikeChars.length; i++) {
- spaceWidthText = font.getFontWidth(spaceLikeChars[i]);
- if (spaceWidthText > 0)
- break;
- }
-
- float ctm00 = gs.getCurrentTransformationMatrix().getValue(0, 0);
-
- return (float) ((spaceWidthText / 1000.0) * fontSizeText
- * horizontalScalingText * (ctm00 == 0 ? 1 : ctm00));
- }
-
- @Override
- protected void processTextPosition(TextPosition textPosition) {
- String c = textPosition.getCharacter();
-
- // if c not printable, return
- if (!isPrintable(c)) {
- return;
- }
-
- Float h = textPosition.getHeightDir();
-
- if (c.equals(NBSP)) { // replace non-breaking space for space
- c = " ";
- }
-
- float wos = textPosition.getWidthOfSpace();
-
- TextElement te = new TextElement(
- Utils.round(textPosition.getYDirAdj() - h, 2),
- Utils.round(textPosition.getXDirAdj(), 2),
- Utils.round(textPosition.getWidthDirAdj(), 2),
- Utils.round(textPosition.getHeightDir(), 2),
- textPosition.getFont(),
- textPosition.getFontSize(),
- c,
- // workaround a possible bug in PDFBox:
- // https://issues.apache.org/jira/browse/PDFBOX-1755
- (Float.isNaN(wos) || wos == 0) ? this.currentSpaceWidth() : wos,
- textPosition.getDir());
-
- if (this.currentClippingPath().intersects(te)) {
-
- this.minCharWidth = (float) Math.min(this.minCharWidth, te.getWidth());
- this.minCharHeight = (float) Math.min(this.minCharHeight, te.getHeight());
-
- this.spatialIndex.add(te);
- this.characters.add(te);
- }
-
- if (this.isDebugClippingPaths() && !this.clippingPaths.contains(this.currentClippingPath())) {
- this.clippingPaths.add(this.currentClippingPath());
- }
-
- }
-
- public AffineTransform getPageTransform() {
-
- if (this.pageTransform != null) {
- return this.pageTransform;
- }
-
- PDRectangle cb = page.findCropBox();
- int rotation = Math.abs(page.findRotation());
-
- this.pageTransform = new AffineTransform();
-
- if (rotation == 90 || rotation == 270) {
- this.pageTransform = AffineTransform.getRotateInstance(rotation * (Math.PI / 180.0), 0, 0);
- this.pageTransform.concatenate(AffineTransform.getScaleInstance(1, -1));
- this.pageTransform.concatenate(AffineTransform.getTranslateInstance(0, cb.getHeight()));
- this.pageTransform.concatenate(AffineTransform.getScaleInstance(1, -1));
- }
- return this.pageTransform;
- }
-
- public Rectangle2D currentClippingPath() {
-
- Shape clippingPath = this.getGraphicsState().getCurrentClippingPath();
- Shape transformedClippingPath = this.getPageTransform()
- .createTransformedShape(clippingPath);
- Rectangle2D transformedClippingPathBounds = transformedClippingPath
- .getBounds2D();
-
- return transformedClippingPathBounds;
- }
-
- public boolean isExtractRulingLines() {
- return extractRulingLines;
- }
-
- private static boolean isPrintable(String s) {
- Character c = s.charAt(0);
- Character.UnicodeBlock block = Character.UnicodeBlock.of(c);
- return (!Character.isISOControl(c)) && c != KeyEvent.CHAR_UNDEFINED
- && block != null && block != Character.UnicodeBlock.SPECIALS;
- }
-
- public boolean isDebugClippingPaths() {
- return debugClippingPaths;
- }
-
- public int getPageCount() {
- return this.pdf_document_pages.size();
- }
-
- class PointComparator implements Comparator {
- @Override
- public int compare(Point2D o1, Point2D o2) {
- float o1X = Utils.round(o1.getX(), 2);
- float o1Y = Utils.round(o1.getY(), 2);
- float o2X = Utils.round(o2.getX(), 2);
- float o2Y = Utils.round(o2.getY(), 2);
-
- if (o1Y > o2Y)
- return 1;
- if (o1Y < o2Y)
- return -1;
- if (o1X > o2X)
- return 1;
- if (o1X < o2X)
- return -1;
- return 0;
- }
- }
}
diff --git a/src/main/java/technology/tabula/ObjectExtractorStreamEngine.java b/src/main/java/technology/tabula/ObjectExtractorStreamEngine.java
new file mode 100644
index 00000000..797cb18c
--- /dev/null
+++ b/src/main/java/technology/tabula/ObjectExtractorStreamEngine.java
@@ -0,0 +1,272 @@
+package technology.tabula;
+
+import java.awt.Shape;
+import java.awt.geom.AffineTransform;
+import java.awt.geom.GeneralPath;
+import java.awt.geom.Line2D;
+import java.awt.geom.PathIterator;
+import java.awt.geom.Point2D;
+import java.awt.geom.Rectangle2D;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Comparator;
+import java.util.List;
+
+import org.apache.pdfbox.contentstream.PDFGraphicsStreamEngine;
+import org.apache.pdfbox.cos.COSName;
+import org.apache.pdfbox.pdmodel.PDPage;
+import org.apache.pdfbox.pdmodel.common.PDRectangle;
+import org.apache.pdfbox.pdmodel.graphics.image.PDImage;
+import org.apache.pdfbox.util.Matrix;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Graphics-stream engine that collects the straight line segments painted on a
+ * PDF page and stores them as {@link Ruling}s.
+ * <p>
+ * Path-construction operators accumulate into {@code currentPath}. Every
+ * painting operator (stroke, fill, fill-and-stroke) then walks that path in
+ * page coordinates (see {@link #getPageTransform()}), clips each segment to the
+ * bounds of the current clipping path and records it. Paths that contain
+ * anything other than MOVETO / LINETO / CLOSE segments (i.e. curves) are
+ * discarded as a whole.
+ */
+class ObjectExtractorStreamEngine extends PDFGraphicsStreamEngine {
+
+    private static final Logger log = LoggerFactory.getLogger(ObjectExtractorStreamEngine.class);
+
+    /** Rulings collected so far, expressed in the space produced by {@link #getPageTransform()}. */
+    protected List<Ruling> rulings;
+    private AffineTransform pageTransform;
+    private boolean debugClippingPaths;
+    private boolean extractRulingLines = true;
+    /** Winding rule of a pending clip operator, or -1 when no clip is pending. */
+    private int clipWindingRule = -1;
+    private GeneralPath currentPath = new GeneralPath();
+
+    /**
+     * Creates an engine for {@code page} and pre-computes the page transform
+     * from the page's crop box and rotation.
+     *
+     * @param page the PDF page whose content stream will be processed
+     */
+    protected ObjectExtractorStreamEngine(PDPage page) {
+        super(page);
+
+        this.rulings = new ArrayList<>();
+
+        // calculate page transform
+        PDRectangle cb = this.getPage().getCropBox();
+        int rotation = this.getPage().getRotation();
+
+        this.pageTransform = new AffineTransform();
+
+        if (Math.abs(rotation) == 90 || Math.abs(rotation) == 270) {
+            // rotated page: rotate about the origin, then flip the Y axis
+            this.pageTransform = AffineTransform.getRotateInstance(rotation * (Math.PI / 180.0), 0, 0);
+            this.pageTransform.concatenate(AffineTransform.getScaleInstance(1, -1));
+        } else {
+            // upright page: flip the Y axis around the crop box height
+            this.pageTransform.concatenate(AffineTransform.getTranslateInstance(0, cb.getHeight()));
+            this.pageTransform.concatenate(AffineTransform.getScaleInstance(1, -1));
+        }
+
+        // make coordinates relative to the crop box's lower-left corner
+        this.pageTransform.translate(-cb.getLowerLeftX(), -cb.getLowerLeftY());
+    }
+
+    /** Appends the closed quadrilateral p0-p1-p2-p3 to the current path. */
+    @Override
+    public void appendRectangle(Point2D p0, Point2D p1, Point2D p2, Point2D p3) {
+        currentPath.moveTo((float) p0.getX(), (float) p0.getY());
+        currentPath.lineTo((float) p1.getX(), (float) p1.getY());
+        currentPath.lineTo((float) p2.getX(), (float) p2.getY());
+        currentPath.lineTo((float) p3.getX(), (float) p3.getY());
+
+        currentPath.closePath();
+    }
+
+    /** Remembers the winding rule of a clip operator; it is applied in {@link #endPath()}. */
+    @Override
+    public void clip(int windingRule) {
+        // the clipping path will not be updated until the succeeding painting
+        // operator is called
+        clipWindingRule = windingRule;
+    }
+
+    @Override
+    public void closePath() {
+        currentPath.closePath();
+    }
+
+    @Override
+    public void curveTo(float x1, float y1, float x2, float y2, float x3, float y3) {
+        currentPath.curveTo(x1, y1, x2, y2, x3, y3);
+    }
+
+    /** No-op: this engine only collects rulings from path operators. */
+    @Override
+    public void drawImage(PDImage arg0) {
+        // images are not inspected
+    }
+
+    /** Ends the current path without painting it, applying a pending clip first. */
+    @Override
+    public void endPath() {
+        if (clipWindingRule != -1) {
+            currentPath.setWindingRule(clipWindingRule);
+            getGraphicsState().intersectClippingPath(currentPath);
+            clipWindingRule = -1;
+        }
+        currentPath.reset();
+    }
+
+    @Override
+    public void fillAndStrokePath(int arg0) {
+        strokeOrFillPath(true);
+    }
+
+    @Override
+    public void fillPath(int arg0) {
+        strokeOrFillPath(true);
+    }
+
+    @Override
+    public Point2D getCurrentPoint() {
+        return currentPath.getCurrentPoint();
+    }
+
+    @Override
+    public void lineTo(float x, float y) {
+        currentPath.lineTo(x, y);
+    }
+
+    @Override
+    public void moveTo(float x, float y) {
+        currentPath.moveTo(x, y);
+    }
+
+    /** No-op: shading fills are not inspected. */
+    @Override
+    public void shadingFill(COSName arg0) {
+        // shadings do not contribute rulings
+    }
+
+    @Override
+    public void strokePath() {
+        strokeOrFillPath(false);
+    }
+
+    /**
+     * Converts the segments of the current path into rulings and resets the path.
+     *
+     * @param isFill whether the painting operator was a fill; currently unused
+     */
+    private void strokeOrFillPath(boolean isFill) {
+        GeneralPath path = this.currentPath;
+
+        if (!this.extractRulingLines) {
+            this.currentPath.reset();
+            return;
+        }
+
+        PathIterator pi = path.getPathIterator(this.getPageTransform());
+        float[] c = new float[6];
+        int currentSegment;
+
+        // skip paths whose first operation is not a MOVETO
+        // or contains operations other than LINETO, MOVETO or CLOSE
+        if ((pi.currentSegment(c) != PathIterator.SEG_MOVETO)) {
+            path.reset();
+            return;
+        }
+        pi.next();
+        while (!pi.isDone()) {
+            currentSegment = pi.currentSegment(c);
+            if (currentSegment != PathIterator.SEG_LINETO && currentSegment != PathIterator.SEG_CLOSE
+                    && currentSegment != PathIterator.SEG_MOVETO) {
+                path.reset();
+                return;
+            }
+            pi.next();
+        }
+
+        // TODO: how to implement color filter?
+
+        // skip the first path operation and save it as the starting position
+        float[] first = new float[6];
+        pi = path.getPathIterator(this.getPageTransform());
+        pi.currentSegment(first);
+        Point2D.Float start_pos = new Point2D.Float(Utils.round(first[0], 2), Utils.round(first[1], 2));
+        // target of the most recent MOVETO; a CLOSE segment returns to it
+        Point2D.Float last_move = start_pos;
+        Point2D.Float end_pos = null;
+        PointComparator pc = new PointComparator();
+        while (!pi.isDone()) {
+            pi.next();
+            // The iterator may already be past its last segment here; the
+            // original authors deliberately still try to read it (us-017.pdf
+            // depends on this) and stop when the read fails.
+            // NOTE(review): relies on currentSegment() throwing once exhausted - confirm.
+            try {
+                currentSegment = pi.currentSegment(c);
+            } catch (IndexOutOfBoundsException ex) {
+                continue;
+            }
+            switch (currentSegment) {
+            case PathIterator.SEG_LINETO:
+                end_pos = new Point2D.Float(c[0], c[1]);
+
+                if (start_pos == null || end_pos == null) {
+                    break;
+                }
+                addClippedRuling(start_pos, end_pos, pc);
+                break;
+            case PathIterator.SEG_MOVETO:
+                last_move = new Point2D.Float(c[0], c[1]);
+                end_pos = last_move;
+                break;
+            case PathIterator.SEG_CLOSE:
+                // according to PathIterator docs:
+                // "the preceding subpath should be closed by appending a line
+                // segment back to the point corresponding to the most recent
+                // SEG_MOVETO."
+                if (start_pos == null || end_pos == null) {
+                    break;
+                }
+                addClippedRuling(end_pos, last_move, pc);
+                break;
+            }
+            start_pos = end_pos;
+        }
+        path.reset();
+    }
+
+    /**
+     * Orders the two endpoints with {@code pc}, clips the resulting segment to
+     * the current clipping bounds and stores it when it is longer than 0.01.
+     */
+    private void addClippedRuling(Point2D.Float from, Point2D.Float to, PointComparator pc) {
+        Line2D.Float line = pc.compare(from, to) < 0 ? new Line2D.Float(from, to) : new Line2D.Float(to, from);
+        Rectangle2D clip = this.currentClippingPath();
+
+        if (line.intersects(clip)) {
+            // intersect clips the line
+            Ruling r = new Ruling(line.getP1(), line.getP2()).intersect(clip);
+
+            if (r.length() > 0.01) {
+                this.rulings.add(r);
+            }
+        }
+    }
+
+    /** @return the transform computed in the constructor from crop box and rotation */
+    public AffineTransform getPageTransform() {
+        return this.pageTransform;
+    }
+
+    /** @return the bounding box of the current clipping path after applying the page transform */
+    public Rectangle2D currentClippingPath() {
+        Shape clippingPath = this.getGraphicsState().getCurrentClippingPath();
+        Shape transformedClippingPath = this.getPageTransform().createTransformedShape(clippingPath);
+
+        return transformedClippingPath.getBounds2D();
+    }
+
+    /** Orders points by Y, then by X, after rounding both coordinates to two decimals. */
+    class PointComparator implements Comparator<Point2D> {
+        @Override
+        public int compare(Point2D o1, Point2D o2) {
+            float o1X = Utils.round(o1.getX(), 2);
+            float o1Y = Utils.round(o1.getY(), 2);
+            float o2X = Utils.round(o2.getX(), 2);
+            float o2Y = Utils.round(o2.getY(), 2);
+
+            if (o1Y > o2Y)
+                return 1;
+            if (o1Y < o2Y)
+                return -1;
+            if (o1X > o2X)
+                return 1;
+            if (o1X < o2X)
+                return -1;
+            return 0;
+        }
+    }
+}
diff --git a/src/main/java/technology/tabula/Page.java b/src/main/java/technology/tabula/Page.java
index ab57d938..ac670558 100644
--- a/src/main/java/technology/tabula/Page.java
+++ b/src/main/java/technology/tabula/Page.java
@@ -2,235 +2,250 @@
import java.awt.geom.Point2D;
import java.util.ArrayList;
-import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
-import java.util.HashMap;
import java.util.List;
-import java.util.Map;
+import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
@SuppressWarnings("serial")
// TODO: this class should probably be called "PageArea" or something like that
public class Page extends Rectangle {
- private Integer rotation;
- private int pageNumber;
- private List texts;
- private List rulings, cleanRulings = null, verticalRulingLines = null, horizontalRulingLines = null;
- private float minCharWidth;
- private float minCharHeight;
- private RectangleSpatialIndex spatial_index;
- private PDPage pdPage;
-
- public Page(float top, float left, float width, float height, int rotation, int page_number, PDPage pdPage) {
- super(top, left, width, height);
- this.rotation = rotation;
- this.pageNumber = page_number;
- this.pdPage = pdPage;
- }
-
- public Page(float top, float left, float width, float height, int rotation, int page_number, PDPage pdPage,
- List characters, List rulings) {
-
- this(top, left, width, height, rotation, page_number, pdPage);
- this.texts = characters;
- this.rulings = rulings;
- }
-
-
- public Page(float top, float left, float width, float height, int rotation, int page_number, PDPage pdPage,
- List characters, List rulings,
- float minCharWidth, float minCharHeight, RectangleSpatialIndex index) {
-
- this(top, left, width, height, rotation, page_number, pdPage, characters, rulings);
- this.minCharHeight = minCharHeight;
- this.minCharWidth = minCharWidth;
- this.spatial_index = index;
- }
+    // rotation and 1-based/0-based numbering are whatever the creator passed in
+    private Integer rotation;
+    private int pageNumber;
+    // text elements of this page area
+    private List<TextElement> texts;
+    // raw rulings plus lazily computed caches (see getRulings / addRuling)
+    private List<Ruling> rulings, cleanRulings = null, verticalRulingLines = null, horizontalRulingLines = null;
+    private float minCharWidth;
+    private float minCharHeight;
+    private RectangleSpatialIndex<TextElement> spatial_index;
+    private PDPage pdPage;
+    private PDDocument pdDoc;
+
+    /**
+     * Creates a page area with geometry and PDF handles only; text, rulings
+     * and the spatial index are left unset.
+     */
+    public Page(float top, float left, float width, float height, int rotation, int page_number, PDPage pdPage, PDDocument doc) {
+        super(top, left, width, height);
+        this.rotation = rotation;
+        this.pageNumber = page_number;
+        this.pdPage = pdPage;
+        this.pdDoc = doc;
+    }
+
+    /**
+     * Creates a page area that additionally carries its text elements and raw rulings.
+     *
+     * @param characters text elements of the area
+     * @param rulings    unprocessed rulings of the area
+     */
+    public Page(float top, float left, float width, float height, int rotation, int page_number, PDPage pdPage, PDDocument doc,
+            List<TextElement> characters, List<Ruling> rulings) {
+
+        this(top, left, width, height, rotation, page_number, pdPage, doc);
+        this.texts = characters;
+        this.rulings = rulings;
+    }
+
+
+    /**
+     * Creates a fully populated page area.
+     *
+     * @param minCharWidth  minimum character width, used when snapping rulings
+     * @param minCharHeight minimum character height, used when snapping rulings
+     * @param index         spatial index used by {@link #getText(Rectangle)}
+     */
+    public Page(float top, float left, float width, float height, int rotation, int page_number, PDPage pdPage, PDDocument doc,
+            List<TextElement> characters, List<Ruling> rulings,
+            float minCharWidth, float minCharHeight, RectangleSpatialIndex<TextElement> index) {
+
+        this(top, left, width, height, rotation, page_number, pdPage, doc, characters, rulings);
+        this.minCharHeight = minCharHeight;
+        this.minCharWidth = minCharWidth;
+        this.spatial_index = index;
+    }
+
+    /**
+     * Returns a new Page restricted to {@code area}.
+     * <p>
+     * The result holds the text elements found in the area, the rulings
+     * cropped to it, and four extra rulings along the area's border. Minimum
+     * character width/height are taken from the smallest text element in the
+     * area, defaulting to 7 when the area contains no text.
+     *
+     * @param area the region to extract
+     * @return a Page covering {@code area}
+     */
+    public Page getArea(Rectangle area) {
+        List<TextElement> t = getText(area);
+        float min_char_width = 7;
+        float min_char_height = 7;
+
+        if (t.size() > 0) {
+            min_char_width = Collections.min(t, new Comparator<TextElement>() {
+                @Override
+                public int compare(TextElement te1, TextElement te2) {
+                    return java.lang.Float.compare(te1.width, te2.width);
+                }}).width;
+            min_char_height = Collections.min(t, new Comparator<TextElement>() {
+                @Override
+                public int compare(TextElement te1, TextElement te2) {
+                    return java.lang.Float.compare(te1.height, te2.height);
+                }}).height;
+        }
+        Page rv = new Page(
+                area.getTop(),
+                area.getLeft(),
+                (float) area.getWidth(),
+                (float) area.getHeight(),
+                rotation,
+                pageNumber,
+                pdPage,
+                pdDoc,
+                t,
+                Ruling.cropRulingsToArea(getRulings(), area),
+                min_char_width,
+                min_char_height,
+                spatial_index);
+
+        // frame the area: top, right, bottom and left edges, in that order
+        rv.addRuling(new Ruling(
+                new Point2D.Double(rv.getLeft(), rv.getTop()),
+                new Point2D.Double(rv.getRight(), rv.getTop())));
+        rv.addRuling(new Ruling(
+                new Point2D.Double(rv.getRight(), rv.getTop()),
+                new Point2D.Double(rv.getRight(), rv.getBottom())));
+        rv.addRuling(new Ruling(
+                new Point2D.Double(rv.getRight(), rv.getBottom()),
+                new Point2D.Double(rv.getLeft(), rv.getBottom())));
+        rv.addRuling(new Ruling(
+                new Point2D.Double(rv.getLeft(), rv.getBottom()),
+                new Point2D.Double(rv.getLeft(), rv.getTop())));
+
+        return rv;
+    }
+
+    /**
+     * Convenience overload of {@link #getArea(Rectangle)} taking the four edges of the area.
+     */
+    public Page getArea(float top, float left, float bottom, float right) {
+        Rectangle area = new Rectangle(top, left, right - left, bottom - top);
+        return this.getArea(area);
+    }
+
+    /** @return the text elements this page was created with (may be null if none were supplied) */
+    public List<TextElement> getText() {
+        return texts;
+    }
+
+    /** @return the text elements the spatial index reports as contained in {@code area} */
+    public List<TextElement> getText(Rectangle area) {
+        return this.spatial_index.contains(area);
+    }
+
+    /** @deprecated use {@linkplain #getText(Rectangle)} instead */
+    @Deprecated public List<TextElement> getText(float top, float left, float bottom, float right) {
+        return this.getText(new Rectangle(top, left, right - left, bottom - top));
+    }
+
+    /** @return the rotation value passed at construction */
+    public Integer getRotation() {
+        return rotation;
+    }
+
+    /** @return the page number passed at construction */
+    public int getPageNumber() {
+        return pageNumber;
+    }
+
+    /** @deprecated use {@linkplain #getText()} instead */
+    @Deprecated public List<TextElement> getTexts() {
+        return texts;
+    }
+
+    /**
+     * Returns the minimum bounding box that contains all the TextElements on this Page.
+     *
+     * @return the bounds computed by {@code Utils.bounds}, or an empty
+     *         {@link Rectangle} when the page has no text elements
+     */
+    public Rectangle getTextBounds() {
+        List<TextElement> texts = this.getText();
+        if (!texts.isEmpty()) {
+            return Utils.bounds(texts);
+        }
+        else {
+            return new Rectangle();
+        }
+
+    }
+
+    /**
+     * Returns the cleaned-up rulings of this page, computing and caching them on first use.
+     * <p>
+     * The raw rulings are snapped with {@code Utils.snapPoints}, split into
+     * vertical and horizontal ones, and each group is collapsed with
+     * {@code Ruling.collapseOrientedRulings}. When the page has no raw rulings
+     * an empty list is returned and nothing but the two oriented caches is set.
+     *
+     * @return Only vertical or horizontal rulings, duplicates are removed
+     */
+    public List<Ruling> getRulings() {
+        if (this.cleanRulings != null) {
+            return this.cleanRulings;
+        }
+
+        if (this.rulings == null || this.rulings.isEmpty()) {
+            this.verticalRulingLines = new ArrayList<>();
+            this.horizontalRulingLines = new ArrayList<>();
+            return new ArrayList<>();
+        }
+
+        Utils.snapPoints(this.rulings, this.minCharWidth, this.minCharHeight);
+
+        List<Ruling> vrs = new ArrayList<>();
+        for (Ruling vr: this.rulings) {
+            if (vr.vertical()) {
+                vrs.add(vr);
+            }
+        }
+        this.verticalRulingLines = Ruling.collapseOrientedRulings(vrs);
+
+        List<Ruling> hrs = new ArrayList<>();
+        for (Ruling hr: this.rulings) {
+            if (hr.horizontal()) {
+                hrs.add(hr);
+            }
+        }
+        this.horizontalRulingLines = Ruling.collapseOrientedRulings(hrs);
+
+        // vertical rulings first, then horizontal ones
+        this.cleanRulings = new ArrayList<>(this.verticalRulingLines);
+        this.cleanRulings.addAll(this.horizontalRulingLines);
+
+        return this.cleanRulings;
+
+    }
+
+    /** @return the collapsed vertical rulings, computing them via {@link #getRulings()} if not cached */
+    public List<Ruling> getVerticalRulings() {
+        if (this.verticalRulingLines != null) {
+            return this.verticalRulingLines;
+        }
+        this.getRulings();
+        return this.verticalRulingLines;
+    }
-
- public Page getArea(Rectangle area) {
- List t = getText(area);
- Page rv = new Page(
- (float) area.getTop(),
- (float) area.getLeft(),
- (float) area.getWidth(),
- (float) area.getHeight(),
- rotation,
- pageNumber,
- pdPage,
- t,
- Ruling.cropRulingsToArea(getRulings(), area),
-
- Collections.min(t, new Comparator() {
- @Override
- public int compare(TextElement te1, TextElement te2) {
- return java.lang.Float.compare(te1.width, te2.width);
- }}).width,
-
- Collections.min(t, new Comparator() {
- @Override
- public int compare(TextElement te1, TextElement te2) {
- return java.lang.Float.compare(te1.height, te2.height);
- }}).height,
-
- spatial_index);
-
- rv.addRuling(new Ruling(
- new Point2D.Double(rv.getLeft(),
- rv.getTop()),
- new Point2D.Double(rv.getRight(),
- rv.getTop())));
- rv.addRuling(new Ruling(
- new Point2D.Double(rv.getRight(),
- rv.getTop()),
- new Point2D.Double(rv.getRight(),
- rv.getBottom())));
- rv.addRuling(new Ruling(
- new Point2D.Double(rv.getRight(),
- rv.getBottom()),
- new Point2D.Double(rv.getLeft(),
- rv.getBottom())));
- rv.addRuling(new Ruling(
- new Point2D.Double(rv.getLeft(),
- rv.getBottom()),
- new Point2D.Double(rv.getLeft(),
- rv.getTop())));
-
- return rv;
- }
-
- public Page getArea(float top, float left, float bottom, float right) {
- Rectangle area = new Rectangle(top, left, right - left, bottom - top);
- return this.getArea(area);
- }
-
- public List getText() {
- return texts;
- }
-
- public List getText(Rectangle area) {
- return this.spatial_index.contains(area);
- }
-
- public List getText(float top, float left, float bottom, float right) {
- return this.getText(new Rectangle(top, left, right - left, bottom - top));
+    /** @return the collapsed horizontal rulings, computing them via {@link #getRulings()} if not cached */
+    public List<Ruling> getHorizontalRulings() {
+        if (this.horizontalRulingLines != null) {
+            return this.horizontalRulingLines;
}
+        this.getRulings();
+        return this.horizontalRulingLines;
+    }
- public Integer getRotation() {
- return rotation;
+    /**
+     * Adds a ruling to this page and invalidates the cached clean/oriented rulings.
+     *
+     * @param r the ruling to add; must be vertical or horizontal
+     * @throws UnsupportedOperationException if {@code r} is oblique
+     */
+    public void addRuling(Ruling r) {
+        if (r.oblique()) {
+            throw new UnsupportedOperationException("Can't add an oblique ruling");
}
+        // pages built without a ruling list start with rulings == null
+        if (this.rulings == null) {
+            this.rulings = new ArrayList<>();
+        }
+        this.rulings.add(r);
+        // clear caches
+        this.verticalRulingLines = null;
+        this.horizontalRulingLines = null;
+        this.cleanRulings = null;
+    }
- public int getPageNumber() {
- return pageNumber;
- }
+    /** @return the raw ruling list held by this page, without snapping or collapsing */
+    public List<Ruling> getUnprocessedRulings() {
+        return this.rulings;
+    }
- public List getTexts() {
- return texts;
- }
-
- /**
- * Returns the minimum bounding box that contains all the TextElements on this Page
- */
- public Rectangle getTextBounds() {
- List texts = this.getText();
- if (!texts.isEmpty()) {
- return Utils.bounds(texts);
- }
- else {
- return new Rectangle();
- }
-
- }
+ /**
+  * Returns the minimum character width stored on this page; the same value
+  * is handed to {@code Utils.snapPoints} when rulings are cleaned.
+  *
+  * @deprecated with no replacement
+  */
+ @Deprecated public float getMinCharWidth() {
+ return minCharWidth;
+ }
- public List getRulings() {
- if (this.cleanRulings != null) {
- return this.cleanRulings;
- }
-
- if (this.rulings == null || this.rulings.isEmpty()) {
- this.verticalRulingLines = new ArrayList();
- this.horizontalRulingLines = new ArrayList();
- return new ArrayList();
- }
-
- Utils.snapPoints(this.rulings, this.minCharWidth, this.minCharHeight);
-
- List vrs = new ArrayList();
- for (Ruling vr: this.rulings) {
- if (vr.vertical()) {
- vrs.add(vr);
- }
- }
- this.verticalRulingLines = Ruling.collapseOrientedRulings(vrs);
-
- List hrs = new ArrayList();
- for (Ruling hr: this.rulings) {
- if (hr.horizontal()) {
- hrs.add(hr);
- }
- }
- this.horizontalRulingLines = Ruling.collapseOrientedRulings(hrs);
-
- this.cleanRulings = new ArrayList(this.verticalRulingLines);
- this.cleanRulings.addAll(this.horizontalRulingLines);
-
- return this.cleanRulings;
-
- }
-
- public List getVerticalRulings() {
- if (this.verticalRulingLines != null) {
- return this.verticalRulingLines;
- }
- this.getRulings();
- return this.verticalRulingLines;
- }
-
- public List getHorizontalRulings() {
- if (this.horizontalRulingLines != null) {
- return this.horizontalRulingLines;
- }
- this.getRulings();
- return this.horizontalRulingLines;
- }
-
- public void addRuling(Ruling r) {
- if (r.oblique()) {
- throw new UnsupportedOperationException("Can't add an oblique ruling");
- }
- this.rulings.add(r);
- // clear caches
- this.verticalRulingLines = null;
- this.horizontalRulingLines = null;
- this.cleanRulings = null;
- }
-
- public List getUnprocessedRulings() {
- return this.rulings;
- }
+ /**
+  * Returns the minimum character height stored on this page; the same value
+  * is handed to {@code Utils.snapPoints} when rulings are cleaned.
+  *
+  * @deprecated with no replacement
+  */
+ @Deprecated public float getMinCharHeight() {
+ return minCharHeight;
+ }
- public float getMinCharWidth() {
- return minCharWidth;
- }
+ /** @return the PDFBox page object this Page was created with */
+ public PDPage getPDPage() {
+ return pdPage;
+ }
- public float getMinCharHeight() {
- return minCharHeight;
- }
+ /** @return the PDFBox document object this Page was created with */
+ public PDDocument getPDDoc() {
+ return pdDoc;
+ }
- public PDPage getPDPage() {
- return pdPage;
- }
+    /**
+     * Returns the spatial index used by {@link #getText(Rectangle)}.
+     *
+     * @deprecated with no replacement
+     */
+    @Deprecated public RectangleSpatialIndex<TextElement> getSpatialIndex() {
+        return this.spatial_index;
+    }
- public RectangleSpatialIndex getSpatialIndex() {
- return this.spatial_index;
- }
-
- public boolean hasText() {
- return this.texts.size() > 0;
- }
-
-
+    /**
+     * Tells whether this page carries at least one text element.
+     *
+     * @return {@code false} when the text list is missing or empty
+     * @deprecated with no replacement
+     */
+    @Deprecated public boolean hasText() {
+        // pages built without a text list have texts == null
+        return this.texts != null && !this.texts.isEmpty();
+    }
}
diff --git a/src/main/java/technology/tabula/Pair.java b/src/main/java/technology/tabula/Pair.java
new file mode 100644
index 00000000..d54cbbe5
--- /dev/null
+++ b/src/main/java/technology/tabula/Pair.java
@@ -0,0 +1,19 @@
+package technology.tabula;
+
+/**
+ * Immutable holder for two related values.
+ *
+ * @param <L> type of the left value
+ * @param <R> type of the right value
+ */
+public class Pair<L, R> {
+    private final L left;
+    private final R right;
+
+    /**
+     * @param left  the left value (stored as given, may be null)
+     * @param right the right value (stored as given, may be null)
+     */
+    public Pair(L left, R right) {
+        this.left = left;
+        this.right = right;
+    }
+
+    /** @return the left value */
+    public L getLeft() {
+        return this.left;
+    }
+
+    /** @return the right value */
+    public R getRight() {
+        return this.right;
+    }
+}
diff --git a/src/main/java/technology/tabula/ProjectionProfile.java b/src/main/java/technology/tabula/ProjectionProfile.java
index 6479964d..39ab9e41 100644
--- a/src/main/java/technology/tabula/ProjectionProfile.java
+++ b/src/main/java/technology/tabula/ProjectionProfile.java
@@ -5,6 +5,8 @@
import java.util.List;
+// NOTE: this class is currently not used by the extraction algorithms
+// keeping it for potential use.
public class ProjectionProfile {
public static final int DECIMAL_PLACES = 1; // fixed <-> float conversion precision
@@ -71,7 +73,7 @@ public float[] getHorizontalProjection() {
public float[] findVerticalSeparators(float minColumnWidth) {
boolean foundNarrower = false;
- List verticalSeparators = new ArrayList();
+ List verticalSeparators = new ArrayList<>();
for (Ruling r: area.getVerticalRulings()) {
if (r.length() / this.textBounds.getHeight() >= 0.95) {
verticalSeparators.add(toFixed(r.getPosition() - this.areaLeft));
@@ -103,7 +105,7 @@ public float[] findVerticalSeparators(float minColumnWidth) {
public float[] findHorizontalSeparators(float minRowHeight) {
boolean foundShorter = false;
- List horizontalSeparators = new ArrayList();
+ List horizontalSeparators = new ArrayList<>();
for (Ruling r: area.getHorizontalRulings()) {
System.out.println(r.length() / this.textBounds.getWidth());
if (r.length() / this.textBounds.getWidth() >= 0.95) {
@@ -134,7 +136,7 @@ public float[] findHorizontalSeparators(float minRowHeight) {
}
private static List findSeparatorsFromProjection(float[] derivative) {
- List separators = new ArrayList();
+ List separators = new ArrayList<>();
Integer lastNeg = null;
float s;
boolean positiveSlope = false;
@@ -165,7 +167,7 @@ public static float[] smooth(float[] data, int kernelSize) {
+ kernelSize / 2, data.length); j++) {
s += data[j];
}
- rv[i] = (float) Math.floor(s / (float) kernelSize);
+ rv[i] = (float) Math.floor(s / kernelSize);
}
}
return rv;
@@ -211,7 +213,7 @@ private static int toFixed(double value) {
}
private static double toDouble(int value) {
- return (double) value / Math.pow(10, DECIMAL_PLACES);
+ return value / Math.pow(10, DECIMAL_PLACES);
}
}
diff --git a/src/main/java/technology/tabula/QuickSort.java b/src/main/java/technology/tabula/QuickSort.java
index 21d26dd5..03388a15 100644
--- a/src/main/java/technology/tabula/QuickSort.java
+++ b/src/main/java/technology/tabula/QuickSort.java
@@ -16,94 +16,97 @@
*/
package technology.tabula;
+import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
+import java.util.RandomAccess;
import java.util.Stack;
/**
- * see http://de.wikipedia.org/wiki/Quicksort.
+ * An implementation of Quicksort.
+ *
+ * @see <a href="http://de.wikipedia.org/wiki/Quicksort">wikipedia</a>
*
* @author UWe Pachler
*/
-public class QuickSort
-{
-
- private QuickSort()
- {
- }
-
- private static final Comparator extends Comparable> objComp = new Comparator()
- {
- public int compare(Comparable object1, Comparable object2)
- {
- return object1.compareTo(object2);
- }
- };
+public final class QuickSort {
+
+ /** Private constructor: this class only exposes static methods and is never instantiated. */
+ private QuickSort() {
+ // utility
+ }
+
+    /**
+     * Sorts the given list according to natural order.
+     *
+     * @param list the list to sort in place
+     */
+    public static <T extends Comparable<? super T>> void sort(List<T> list) {
+        sort(list, QuickSort.<T>naturalOrder()); // JAVA_8 replace with Comparator.naturalOrder() (and cleanup)
+    }
+
+    /**
+     * Sorts the given list using the given comparator.
+     * <p>
+     * Lists that are not {@link RandomAccess} are copied into an
+     * {@link ArrayList}, sorted there, and then refilled via
+     * {@code clear()} / {@code addAll()}.
+     *
+     * @param list       the list to sort in place
+     * @param comparator ordering to apply
+     */
+    public static <T> void sort(List<T> list, Comparator<? super T> comparator) {
+        if (list instanceof RandomAccess) {
+            quicksort(list, comparator);
+        } else {
+            List<T> copy = new ArrayList<>(list);
+            quicksort(copy, comparator);
+            list.clear();
+            list.addAll(copy);
+        }
+    }
- /**
- * Sorts the given list using the given comparator.
- */
- public static void sort(List list, Comparator cmp)
- {
- quicksort(list, cmp);
- }
+    /**
+     * Iterative quicksort: pending half-open ranges [left, right) are kept on
+     * an explicit stack instead of recursing.
+     */
+    private static <T> void quicksort(List<T> list, Comparator<? super T> cmp) {
+        // ranges are pushed as (left, right) and therefore popped as right, then left
+        Stack<Integer> stack = new Stack<>();
+        stack.push(0);
+        stack.push(list.size());
+        while (!stack.isEmpty()) {
+            int right = stack.pop();
+            int left = stack.pop();
+
+            if (right - left < 2) continue;
+            int p = left + ((right - left) / 2);
+            p = partition(list, cmp, p, left, right);
- /**
- * Sorts the given list using compareTo as comparator.
- */
- public static void sort(List list)
- {
- sort(list, (Comparator) objComp);
- }
+            stack.push(p + 1);
+            stack.push(right);
- private static void quicksort(List list, Comparator cmp)
- {
- Stack stack = new Stack();
- stack.push(0);
- stack.push(list.size());
- while (!stack.isEmpty()) {
- int right = stack.pop();
- int left = stack.pop();
- if (right - left < 2) continue;
- int p = left + ((right-left)/2);
- p = partition(list, cmp, p, left, right);
-
- stack.push(p+1);
- stack.push(right);
+            stack.push(left);
+            stack.push(p);
+        }
+    }
- stack.push(left);
- stack.push(p);
+    /**
+     * Partitions the range [start, end) around the element at index {@code p}.
+     * The pivot is parked at {@code end - 1} while the rest of the range is
+     * rearranged, then swapped into its final position.
+     *
+     * @return the final index of the pivot
+     */
+    private static <T> int partition(List<T> list, Comparator<? super T> cmp, int p, int start, int end) {
+        int l = start;
+        int h = end - 2;
+        T piv = list.get(p);
+        swap(list, p, end - 1);
- }
- }
-
- private static int partition(List list, Comparator cmp, int p, int start, int end) {
- int l = start;
- int h = end - 2;
- T piv = list.get(p);
- swap(list,p,end-1);
+        while (l < h) {
+            if (cmp.compare(list.get(l), piv) <= 0) l++;
+            else if (cmp.compare(piv, list.get(h)) <= 0) h--;
+            else swap(list, l, h);
+        }
+        int idx = h;
+        if (cmp.compare(list.get(h), piv) < 0) idx++;
+        swap(list, end - 1, idx);
+        return idx;
+    }
- while (l < h) {
- if (cmp.compare(list.get(l), piv) <= 0) {
- l++;
- } else if (cmp.compare(piv, list.get(h)) <= 0) {
- h--;
- } else {
- swap(list,l,h);
- }
- }
- int idx = h;
- if (cmp.compare(list.get(h), piv) < 0) idx++;
- swap(list,end-1,idx);
- return idx;
- }
-
+    /** Exchanges the elements at indices {@code i} and {@code j}. */
+    private static <T> void swap(List<T> list, int i, int j) {
+        T tmp = list.get(i);
+        list.set(i, list.get(j));
+        list.set(j, tmp);
+    }
- private static void swap(List list, int i, int j)
- {
- T tmp = list.get(i);
- list.set(i, list.get(j));
- list.set(j, tmp);
- }
+    // Deliberately raw: one shared instance serves every Comparable type.
+    @SuppressWarnings({ "rawtypes", "unchecked" })
+    private static final Comparator NATURAL_ORDER = new Comparator() {
+        @Override public int compare(Object l, Object r) { return ((Comparable) l).compareTo(r); }
+    };
+
+    /** @return the shared natural-order comparator, viewed as a {@code Comparator<T>} */
+    @SuppressWarnings("unchecked")
+    private static <T extends Comparable<? super T>> Comparator<T> naturalOrder() {
+        return NATURAL_ORDER;
+    }
}
diff --git a/src/main/java/technology/tabula/Rectangle.java b/src/main/java/technology/tabula/Rectangle.java
index 4dc75298..95aebb8d 100644
--- a/src/main/java/technology/tabula/Rectangle.java
+++ b/src/main/java/technology/tabula/Rectangle.java
@@ -2,167 +2,191 @@
import java.awt.geom.Point2D;
import java.awt.geom.Rectangle2D;
+import java.util.Comparator;
import java.util.List;
+import java.util.Locale;
@SuppressWarnings("serial")
-public class Rectangle extends Rectangle2D.Float implements Comparable {
-
- private static final float VERTICAL_COMPARISON_THRESHOLD = 0.4f;
-
- public Rectangle() {
- super();
- }
-
- public Rectangle(float top, float left, float width, float height) {
- super();
- this.setRect(left, top, width, height);
- }
-
- @Override
- public int compareTo(Rectangle other) {
- double thisBottom = this.getBottom();
- double otherBottom = other.getBottom();
- int rv;
-
- if (this.equals(other)) return 0;
-
- if (this.verticalOverlap(other) > VERTICAL_COMPARISON_THRESHOLD) {
- rv = java.lang.Double.compare(this.getX(), other.getX());
- }
- else {
- rv = java.lang.Double.compare(thisBottom, otherBottom);
- }
- return rv;
- }
-
-
-
- public float getArea() {
- return this.width * this.height;
- }
-
- public float verticalOverlap(Rectangle other) {
- return (float) Math.max(0, Math.min(this.getBottom(), other.getBottom()) - Math.max(this.getTop(), other.getTop()));
- }
-
- public boolean verticallyOverlaps(Rectangle other) {
- return verticalOverlap(other) > 0;
- }
-
- public float horizontalOverlap(Rectangle other) {
- return (float) Math.max(0, Math.min(this.getRight(), other.getRight()) - Math.max(this.getLeft(), other.getLeft()));
- }
-
- public boolean horizontallyOverlaps(Rectangle other) {
- return horizontalOverlap(other) > 0;
- }
-
- public float verticalOverlapRatio(Rectangle other) {
- float rv = 0,
- delta = (float) Math.min(this.getBottom() - this.getTop(), other.getBottom() - other.getTop());
-
- if (other.getTop() <= this.getTop() && this.getTop() <= other.getBottom() && other.getBottom() <= this.getBottom()) {
- rv = (float) ((other.getBottom() - this.getTop()) / delta);
- }
- else if (this.getTop() <= other.getTop() && other.getTop() <= this.getBottom() && this.getBottom() <= other.getBottom()) {
- rv = (float) ((this.getBottom() - other.getTop()) / delta);
- }
- else if (this.getTop() <= other.getTop() && other.getTop() <= other.getBottom() && other.getBottom() <= this.getBottom()) {
- rv = (float) ((other.getBottom() - other.getTop()) / delta);
- }
- else if (other.getTop() <= this.getTop() && this.getTop() <= this.getBottom() && this.getBottom() <= other.getBottom()) {
- rv = (float) ((this.getBottom() - this.getTop()) / delta);
- }
-
- return rv;
-
- }
-
- public float overlapRatio(Rectangle other) {
- double intersectionWidth = Math.max(0, Math.min(this.getRight(), other.getRight()) - Math.max(this.getLeft(), other.getLeft()));
- double intersectionHeight = Math.max(0, Math.min(this.getBottom(), other.getBottom()) - Math.max(this.getTop(), other.getTop()));
- double intersectionArea = Math.max(0, intersectionWidth * intersectionHeight);
- double unionArea = this.getArea() + other.getArea() - intersectionArea;
-
- return (float) (intersectionArea / unionArea);
- }
-
- public Rectangle merge(Rectangle other) {
- this.setRect(this.createUnion(other));
- return this;
- }
-
- public float getTop() {
- return (float) this.getMinY();
- }
-
- public void setTop(float top) {
- float deltaHeight = top - this.y;
- this.setRect(this.x, top, this.width, this.height - deltaHeight);
- }
-
- public float getRight() {
- return (float) this.getMaxX();
- }
-
- public void setRight(float right) {
- this.setRect(this.x, this.y, right - this.x, this.height);
- }
-
- public float getLeft() {
- return (float) this.getMinX();
- }
-
- public void setLeft(float left) {
- float deltaWidth = left - this.x;
- this.setRect(left, this.y, this.width - deltaWidth, this.height);
- }
-
- public float getBottom() {
- return (float) this.getMaxY();
- }
-
- public void setBottom(float bottom) {
- this.setRect(this.x, this.y, this.width, bottom - this.y);
- }
-
- public Point2D[] getPoints() {
- return new Point2D[] {
- new Point2D.Float((float) this.getLeft(), (float) this.getTop()),
- new Point2D.Float((float) this.getRight(), (float) this.getTop()),
- new Point2D.Float((float) this.getRight(), (float) this.getBottom()),
- new Point2D.Float((float) this.getLeft(), (float) this.getBottom())
- };
- }
-
- @Override
- public String toString() {
- StringBuilder sb = new StringBuilder();
- String s = super.toString();
- sb.append(s.substring(0, s.length() - 1));
- sb.append(String.format(",bottom=%f,right=%f]", this.getBottom(), this.getRight()));
- return sb.toString();
- }
-
-
- /**
- * @param rectangles
- * @return minimum bounding box that contains all the rectangles
- */
- public static Rectangle boundingBoxOf(List extends Rectangle> rectangles) {
- float minx = java.lang.Float.MAX_VALUE;
- float miny = java.lang.Float.MAX_VALUE;
- float maxx = java.lang.Float.MIN_VALUE;
- float maxy = java.lang.Float.MIN_VALUE;
-
- for (Rectangle r: rectangles) {
- minx = (float) Math.min(r.getMinX(), minx);
- miny = (float) Math.min(r.getMinY(), miny);
- maxx = (float) Math.max(r.getMaxX(), maxx);
- maxy = (float) Math.max(r.getMaxY(), maxy);
- }
- return new Rectangle(miny, minx, maxx - minx, maxy - miny);
- }
-
+public class Rectangle extends Rectangle2D.Float {
+
+    /**
+     * Ill-defined comparator, from when Rectangle was Comparable.
+     * <p>
+     * Equal rectangles compare as 0. Rectangles whose vertical overlap exceeds
+     * {@code VERTICAL_COMPARISON_THRESHOLD} are ordered by X (reversed when
+     * both report {@code isLtrDominant() == -1}); all others are ordered by
+     * their bottom edge.
+     *
+     * @see <a href="https://github.com/tabulapdf/tabula-java/pull/116">PR 116</a>
+     * @deprecated with no replacement
+     */
+    @Deprecated
+    public static final Comparator<Rectangle> ILL_DEFINED_ORDER = new Comparator<Rectangle>() {
+        @Override public int compare(Rectangle o1, Rectangle o2) {
+            if (o1.equals(o2)) return 0;
+            if (o1.verticalOverlap(o2) > VERTICAL_COMPARISON_THRESHOLD) {
+                return o1.isLtrDominant() == -1 && o2.isLtrDominant() == -1
+                        ? - java.lang.Double.compare(o1.getX(), o2.getX())
+                        : java.lang.Double.compare(o1.getX(), o2.getX());
+            } else {
+                return java.lang.Float.compare(o1.getBottom(), o2.getBottom());
+            }
+        }
+    };
+
+ protected static final float VERTICAL_COMPARISON_THRESHOLD = 0.4f;
+
+ public Rectangle() {
+ super();
+ }
+
+ public Rectangle(float top, float left, float width, float height) {
+ super();
+ this.setRect(left, top, width, height);
+ }
+
+ public int compareTo(Rectangle other) {
+ return ILL_DEFINED_ORDER.compare(this, other);
+ }
+
+ // Hook read by ILL_DEFINED_ORDER: x-order is reversed only when both rectangles
+ // return -1. Needed for the sorting in technology.tabula.TextChunk.
+ public int isLtrDominant() {
+ return 0;
+ }
+
+ public float getArea() {
+ return this.width * this.height;
+ }
+
+ public float verticalOverlap(Rectangle other) {
+ return Math.max(0, Math.min(this.getBottom(), other.getBottom()) - Math.max(this.getTop(), other.getTop()));
+ }
+
+ public boolean verticallyOverlaps(Rectangle other) {
+ return verticalOverlap(other) > 0;
+ }
+
+ public float verticalOverlapPercent(Rectangle other) {
+ float overlap = verticalOverlap(other);
+ return (overlap < 0) ? 0 : (float) (overlap / Math.max(getHeight(), other.getHeight()));
+ }
+
+ public float horizontalOverlap(Rectangle other) {
+ return Math.max(0, Math.min(this.getRight(), other.getRight()) - Math.max(this.getLeft(), other.getLeft()));
+ }
+
+ public boolean horizontallyOverlaps(Rectangle other) {
+ return horizontalOverlap(other) > 0;
+ }
+
+ public float verticalOverlapRatio(Rectangle other) {
+ float rv = 0, delta = Math.min(this.getBottom() - this.getTop(), other.getBottom() - other.getTop());
+
+ if (other.getTop() <= this.getTop() && this.getTop() <= other.getBottom()
+ && other.getBottom() <= this.getBottom()) {
+ rv = (other.getBottom() - this.getTop()) / delta;
+ } else if (this.getTop() <= other.getTop() && other.getTop() <= this.getBottom()
+ && this.getBottom() <= other.getBottom()) {
+ rv = (this.getBottom() - other.getTop()) / delta;
+ } else if (this.getTop() <= other.getTop() && other.getTop() <= other.getBottom()
+ && other.getBottom() <= this.getBottom()) {
+ rv = (other.getBottom() - other.getTop()) / delta;
+ } else if (other.getTop() <= this.getTop() && this.getTop() <= this.getBottom()
+ && this.getBottom() <= other.getBottom()) {
+ rv = (this.getBottom() - this.getTop()) / delta;
+ }
+
+ return rv;
+
+ }
+
+ public float overlapRatio(Rectangle other) {
+ double intersectionWidth = Math.max(0,
+ Math.min(this.getRight(), other.getRight()) - Math.max(this.getLeft(), other.getLeft()));
+ double intersectionHeight = Math.max(0,
+ Math.min(this.getBottom(), other.getBottom()) - Math.max(this.getTop(), other.getTop()));
+ double intersectionArea = Math.max(0, intersectionWidth * intersectionHeight);
+ double unionArea = this.getArea() + other.getArea() - intersectionArea;
+
+ return (float) (intersectionArea / unionArea);
+ }
+
+ public Rectangle merge(Rectangle other) {
+ this.setRect(this.createUnion(other));
+ return this;
+ }
+
+ public float getTop() {
+ return (float) this.getMinY();
+ }
+
+ public void setTop(float top) {
+ float deltaHeight = top - this.y;
+ this.setRect(this.x, top, this.width, this.height - deltaHeight);
+ }
+
+ public float getRight() {
+ return (float) this.getMaxX();
+ }
+
+ public void setRight(float right) {
+ this.setRect(this.x, this.y, right - this.x, this.height);
+ }
+
+ public float getLeft() {
+ return (float) this.getMinX();
+ }
+
+ public void setLeft(float left) {
+ float deltaWidth = left - this.x;
+ this.setRect(left, this.y, this.width - deltaWidth, this.height);
+ }
+
+ public float getBottom() {
+ return (float) this.getMaxY();
+ }
+
+ public void setBottom(float bottom) {
+ this.setRect(this.x, this.y, this.width, bottom - this.y);
+ }
+
+ public Point2D[] getPoints() {
+ return new Point2D[] { new Point2D.Float(this.getLeft(), this.getTop()),
+ new Point2D.Float(this.getRight(), this.getTop()), new Point2D.Float(this.getRight(), this.getBottom()),
+ new Point2D.Float(this.getLeft(), this.getBottom()) };
+ }
+
+ public boolean almostContains(Rectangle other) {
+ Rectangle otherSmaller = new Rectangle();
+ float margin = (other.width > 10 && other.height > 10) ? 2f : 0f;
+ otherSmaller.setRect(other.x + margin, other.y + margin,
+ other.width - 2 * margin, other.height - 2 * margin);
+
+ return contains(otherSmaller);
+ }
+
+ @Override
+ public String toString() {
+ StringBuilder sb = new StringBuilder();
+ String s = super.toString();
+ sb.append(s.substring(0, s.length() - 1));
+ sb.append(String.format(Locale.US, ",bottom=%f,right=%f]", this.getBottom(), this.getRight()));
+ return sb.toString();
+ }
+
+ /**
+ * @param rectangles
+ * @return minimum bounding box that contains all the rectangles
+ */
+ public static Rectangle boundingBoxOf(List extends Rectangle> rectangles) {
+ float minx = java.lang.Float.MAX_VALUE;
+ float miny = java.lang.Float.MAX_VALUE;
+ float maxx = java.lang.Float.MIN_VALUE;
+ float maxy = java.lang.Float.MIN_VALUE;
+
+ for (Rectangle r : rectangles) {
+ minx = (float) Math.min(r.getMinX(), minx);
+ miny = (float) Math.min(r.getMinY(), miny);
+ maxx = (float) Math.max(r.getMaxX(), maxx);
+ maxy = (float) Math.max(r.getMaxY(), maxy);
+ }
+ return new Rectangle(miny, minx, maxx - minx, maxy - miny);
+ }
}
diff --git a/src/main/java/technology/tabula/RectangleSpatialIndex.java b/src/main/java/technology/tabula/RectangleSpatialIndex.java
index e3aa633e..4fba6162 100644
--- a/src/main/java/technology/tabula/RectangleSpatialIndex.java
+++ b/src/main/java/technology/tabula/RectangleSpatialIndex.java
@@ -1,88 +1,54 @@
package technology.tabula;
-import gnu.trove.procedure.TIntProcedure;
-
import java.util.ArrayList;
-import java.util.Collections;
import java.util.List;
-import net.sf.jsi.SpatialIndex;
-import net.sf.jsi.rtree.RTree;
+import org.locationtech.jts.geom.Envelope;
+import org.locationtech.jts.index.strtree.STRtree;
-class RectangleSpatialIndex {
+/***
+ * List that sorts rectangles in spatial order
+ */
+public class RectangleSpatialIndex {
- class SaveToListProcedure implements TIntProcedure {
- private List ids = new ArrayList();
- public boolean execute(int id) {
- ids.add(id);
- return true;
- };
-
- private List getIds() {
- return ids;
- }
- };
-
- private final SpatialIndex si;
- private final List rectangles;
- private Rectangle bounds = null;
-
- public RectangleSpatialIndex() {
- si = new RTree();
- si.init(null);
- rectangles = new ArrayList();
- }
-
+ private final STRtree si = new STRtree();
+ private final List rectangles = new ArrayList<>();
+
public void add(T te) {
rectangles.add(te);
- if (bounds == null) {
- bounds = new Rectangle();
- bounds.setRect(te);
- }
- else {
- bounds.merge(te);
- }
- si.add(rectangleToSpatialIndexRectangle(te), rectangles.size() - 1);
+ si.insert(new Envelope(te.getLeft(), te.getRight(), te.getBottom(), te.getTop()), te);
}
-
+
+ /**
+ * Returns the indexed rectangles that lie entirely inside r, sorted by Rectangle.ILL_DEFINED_ORDER.
+ */
public List contains(Rectangle r) {
- SaveToListProcedure proc = new SaveToListProcedure();
- si.contains(rectangleToSpatialIndexRectangle(r), proc);
- ArrayList rv = new ArrayList();
- for (int i : proc.getIds()) {
- rv.add(rectangles.get(i));
+ List intersection = si.query(new Envelope(r.getLeft(), r.getRight(), r.getTop(), r.getBottom()));
+ List rv = new ArrayList();
+
+ for (T ir: intersection) {
+ if (r.contains(ir)) {
+ rv.add(ir);
+ }
}
- Utils.sort(rv);
+
+ Utils.sort(rv, Rectangle.ILL_DEFINED_ORDER);
return rv;
}
public List intersects(Rectangle r) {
- SaveToListProcedure proc = new SaveToListProcedure();
- si.intersects(rectangleToSpatialIndexRectangle(r), proc);
- ArrayList rv = new ArrayList();
- for (int i : proc.getIds()) {
- rv.add(rectangles.get(i));
- }
- Utils.sort(rv);
+ List rv = si.query(new Envelope(r.getLeft(), r.getRight(), r.getTop(), r.getBottom()));
return rv;
}
- private net.sf.jsi.Rectangle rectangleToSpatialIndexRectangle(Rectangle r) {
- return new net.sf.jsi.Rectangle((float) r.getX(),
- (float) r.getY(),
- (float) (r.getX() + r.getWidth()),
- (float) (r.getY() + r.getHeight()));
- }
-
-
/**
* Minimum bounding box of all the Rectangles contained on this RectangleSpatialIndex
*
* @return a Rectangle
*/
public Rectangle getBounds() {
- return bounds;
+ return Rectangle.boundingBoxOf(rectangles);
}
}
diff --git a/src/main/java/technology/tabula/RectangularTextContainer.java b/src/main/java/technology/tabula/RectangularTextContainer.java
index f9e0036f..5f4d3716 100644
--- a/src/main/java/technology/tabula/RectangularTextContainer.java
+++ b/src/main/java/technology/tabula/RectangularTextContainer.java
@@ -5,31 +5,32 @@
@SuppressWarnings("serial")
public abstract class RectangularTextContainer extends Rectangle {
- public RectangularTextContainer(float top, float left, float width, float height) {
- super(top, left, width, height);
- }
-
- public String toString() {
- StringBuilder sb = new StringBuilder();
- String s = super.toString();
- sb.append(s.substring(0, s.length() - 1));
- sb.append(String.format(",text=%s]", this.getText() == null ? "null" : "\"" + this.getText() + "\""));
- return sb.toString();
- }
-
- public RectangularTextContainer merge(RectangularTextContainer other) {
- if (this.compareTo(other) < 0) {
- this.getTextElements().addAll(other.getTextElements());
-
- }
- else {
- this.getTextElements().addAll(0, other.getTextElements());
- }
- super.merge(other);
- return this;
- }
-
- public abstract String getText();
- public abstract String getText(boolean useLineReturns);
- public abstract List getTextElements();
+ public RectangularTextContainer(float top, float left, float width, float height) {
+ super(top, left, width, height);
+ }
+
+ public RectangularTextContainer merge(RectangularTextContainer other) {
+ if (compareTo(other) < 0) {
+ this.getTextElements().addAll(other.getTextElements());
+ } else {
+ this.getTextElements().addAll(0, other.getTextElements());
+ }
+ super.merge(other);
+ return this;
+ }
+
+ public abstract String getText();
+
+ public abstract String getText(boolean useLineReturns);
+
+ public abstract List getTextElements();
+
+ @Override public String toString() {
+ StringBuilder sb = new StringBuilder();
+ String s = super.toString();
+ sb.append(s.substring(0, s.length() - 1));
+ sb.append(String.format(",text=%s]", this.getText() == null ? "null" : "\"" + this.getText() + "\""));
+ return sb.toString();
+ }
+
}
diff --git a/src/main/java/technology/tabula/Ruling.java b/src/main/java/technology/tabula/Ruling.java
index caf5914a..549baddd 100644
--- a/src/main/java/technology/tabula/Ruling.java
+++ b/src/main/java/technology/tabula/Ruling.java
@@ -8,6 +8,7 @@
import java.util.Comparator;
import java.util.Formatter;
import java.util.List;
+import java.util.Locale;
import java.util.Map;
import java.util.TreeMap;
@@ -16,7 +17,7 @@ public class Ruling extends Line2D.Float {
private static int PERPENDICULAR_PIXEL_EXPAND_AMOUNT = 2;
private static int COLINEAR_OR_PARALLEL_PIXEL_EXPAND_AMOUNT = 1;
- private enum SOType { VERTICAL, HRIGHT, HLEFT };
+ private enum SOType { VERTICAL, HRIGHT, HLEFT }
public Ruling(float top, float left, float width, float height) {
this(new Point2D.Float(left, top), new Point2D.Float(left+width, top+height));
@@ -117,6 +118,20 @@ public void setEnd(float v) {
this.setRight(v);
}
}
+
+ private void setStartEnd(float start, float end) {
+ if (this.oblique()) {
+ throw new UnsupportedOperationException();
+ }
+ if (this.vertical()) {
+ this.setTop(start);
+ this.setBottom(end);
+ }
+ else {
+ this.setLeft(start);
+ this.setRight(end);
+ }
+ }
// -----
@@ -277,13 +292,13 @@ public double getAngle() {
public String toString() {
StringBuilder sb = new StringBuilder();
Formatter formatter = new Formatter(sb);
- String rv = formatter.format("%s[x1=%f y1=%f x2=%f y2=%f]", this.getClass().toString(), this.x1, this.y1, this.x2, this.y2).toString();
+ String rv = formatter.format(Locale.US, "%s[x1=%f y1=%f x2=%f y2=%f]", this.getClass().toString(), this.x1, this.y1, this.x2, this.y2).toString();
formatter.close();
return rv;
}
public static List cropRulingsToArea(List rulings, Rectangle2D area) {
- ArrayList rv = new ArrayList();
+ ArrayList rv = new ArrayList<>();
for (Ruling r : rulings) {
if (r.intersects(area)) {
rv.add(r.intersect(area));
@@ -308,15 +323,15 @@ public SortObject(SOType type, float position, Ruling ruling) {
}
}
- List sos = new ArrayList();
+ List sos = new ArrayList<>();
- TreeMap tree = new TreeMap(new Comparator() {
+ TreeMap tree = new TreeMap<>(new Comparator() {
@Override
public int compare(Ruling o1, Ruling o2) {
return java.lang.Double.compare(o1.getTop(), o2.getTop());
}});
- TreeMap rv = new TreeMap(new Comparator() {
+ TreeMap rv = new TreeMap<>(new Comparator() {
@Override
public int compare(Point2D o1, Point2D o2) {
if (o1.getY() > o2.getY()) return 1;
@@ -395,24 +410,40 @@ public static List collapseOrientedRulings(List lines) {
}
public static List collapseOrientedRulings(List lines, int expandAmount) {
- ArrayList rv = new ArrayList();
- if (lines.size() == 0) {
- return rv;
- }
+ ArrayList rv = new ArrayList<>();
Collections.sort(lines, new Comparator() {
@Override
public int compare(Ruling a, Ruling b) {
- return (int) (!Utils.feq(a.getPosition(), b.getPosition()) ? a.getPosition() - b.getPosition() : a.getStart() - b.getStart());
+ final float diff = a.getPosition() - b.getPosition();
+ return java.lang.Float.compare(diff == 0 ? a.getStart() - b.getStart() : diff, 0f);
}
});
-
- rv.add(lines.remove(0));
+
for (Ruling next_line : lines) {
- Ruling last = rv.get(rv.size() - 1);
+ Ruling last = rv.isEmpty() ? null : rv.get(rv.size() - 1);
+
+ float origNextLinePosition = next_line.getPosition();
+ if (last != null && Utils.feq(next_line.getPosition(), last.getPosition(), 2.0)) {
+ next_line.setPosition(last.getPosition());
+ }
+
// if current line colinear with next, and are "close enough": expand current line
- if (Utils.feq(next_line.getPosition(), last.getPosition()) && last.nearlyIntersects(next_line, expandAmount)) {
- last.setStart(next_line.getStart() < last.getStart() ? next_line.getStart() : last.getStart());
- last.setEnd(next_line.getEnd() < last.getEnd() ? last.getEnd() : next_line.getEnd());
+ if (last != null && Utils.feq(next_line.getPosition(), last.getPosition()) && last.nearlyIntersects(next_line, expandAmount)) {
+ next_line.setPosition(origNextLinePosition);
+ final float lastStart = last.getStart();
+ final float lastEnd = last.getEnd();
+
+ final boolean lastFlipped = lastStart > lastEnd;
+ final boolean nextFlipped = next_line.getStart() > next_line.getEnd();
+
+ boolean differentDirections = nextFlipped != lastFlipped;
+ float nextS = differentDirections ? next_line.getEnd() : next_line.getStart();
+ float nextE = differentDirections ? next_line.getStart() : next_line.getEnd();
+
+ final float newStart = lastFlipped ? Math.max(nextS, lastStart) : Math.min(nextS, lastStart);
+ final float newEnd = lastFlipped ? Math.min(nextE, lastEnd) : Math.max(nextE, lastEnd);
+ last.setStartEnd(newStart, newEnd);
+ assert !last.oblique();
}
else if (next_line.length() == 0) {
continue;
diff --git a/src/main/java/technology/tabula/Table.java b/src/main/java/technology/tabula/Table.java
index eda11251..3f971d0e 100644
--- a/src/main/java/technology/tabula/Table.java
+++ b/src/main/java/technology/tabula/Table.java
@@ -8,139 +8,98 @@
@SuppressWarnings("serial")
public class Table extends Rectangle {
-
- class CellPosition implements Comparable {
- int row, col;
- CellPosition(int row, int col) {
- this.row = row; this.col = col;
- }
-
- @Override
- public boolean equals(Object other) {
- if (this == other)
- return true;
- if (!(other instanceof CellPosition))
- return false;
- return other != null && this.row == ((CellPosition) other).row && this.col == ((CellPosition) other).col;
- }
-
- @Override
- public int hashCode() {
- return this.row * 100000 + this.col;
- }
-
- @Override
- public int compareTo(CellPosition other) {
- int rv = 0;
- if(this.row < other.row) {
- rv = -1;
- }
- else if (this.row > other.row) {
- rv = 1;
- }
- else if (this.col > other.col) {
- rv = 1;
- }
- else if (this.col < other.col) {
- rv = -1;
- }
- return rv;
- }
- }
-
- class CellContainer extends TreeMap {
-
- public int maxRow = 0, maxCol = 0;
-
- public RectangularTextContainer get(int row, int col) {
- return this.get(new CellPosition(row, col));
- }
-
- public List getRow(int row) {
- return new ArrayList(this.subMap(new CellPosition(row, 0), new CellPosition(row, maxRow+1)).values());
- }
-
- @Override
- public RectangularTextContainer put(CellPosition cp, RectangularTextContainer value) {
- this.maxRow = Math.max(maxRow, cp.row);
- this.maxCol = Math.max(maxCol, cp.col);
- if (this.containsKey(cp)) { // adding on an existing CellPosition, concatenate content and resize
- value.merge(this.get(cp));
- }
- super.put(cp, value);
- return value;
- }
-
- @Override
- public RectangularTextContainer get(Object key) {
- return this.containsKey(key) ? super.get(key) : TextChunk.EMPTY;
- }
-
- public boolean containsKey(int row, int col) {
- return this.containsKey(new CellPosition(row, col));
- }
-
- }
-
- public static final Table EMPTY = new Table();
-
- CellContainer cellContainer = new CellContainer();
- Page page;
- ExtractionAlgorithm extractionAlgorithm;
- List> rows = null;
-
- public Table() {
- super();
- }
-
- public Table(Page page, ExtractionAlgorithm extractionAlgorithm) {
- this();
- this.page = page;
- this.extractionAlgorithm = extractionAlgorithm;
- }
-
- public void add(RectangularTextContainer tc, int i, int j) {
- this.merge(tc);
- this.cellContainer.put(new CellPosition(i, j), tc);
- this.rows = null; // clear the memoized rows
- }
-
- public List> getRows() {
- if (this.rows != null) {
- return this.rows;
- }
-
- this.rows = new ArrayList>();
- for (int i = 0; i <= this.cellContainer.maxRow; i++) {
- List lastRow = new ArrayList();
- this.rows.add(lastRow);
- for (int j = 0; j <= this.cellContainer.maxCol; j++) {
- lastRow.add(this.cellContainer.containsKey(i, j) ? this.cellContainer.get(i, j) : TextChunk.EMPTY);
- }
- }
- return this.rows;
- }
-
- public RectangularTextContainer getCell(int i, int j) {
- return this.cellContainer.get(i, j);
- }
-
- public List> getCols() {
- return Utils.transpose(this.getRows());
- }
-
- public void setExtractionAlgorithm(ExtractionAlgorithm extractionAlgorithm) {
- this.extractionAlgorithm = extractionAlgorithm;
- }
-
- public ExtractionAlgorithm getExtractionAlgorithm() {
- return extractionAlgorithm;
- }
-
- public List getCells() {
- return (List