From 84aef7f0c84f4ec0fdb5adee789c93c4a216073c Mon Sep 17 00:00:00 2001 From: Martin Skopp Date: Wed, 7 Dec 2022 15:11:10 +0100 Subject: [PATCH 01/30] Add a simple API usage example applying SpreadsheetExtractionAlgorithm --- README.md | 40 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 8d3c8210..c3a5f92f 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ tabula-java [![Build Status](https://travis-ci.org/tabulapdf/tabula-java.svg?bra Download a version of the tabula-java's jar, with all dependencies included, that works on Mac, Windows and Linux from our [releases page](../../releases). -## Usage Examples +## Commandline Usage Examples `tabula-java` provides a command line application: @@ -81,6 +81,44 @@ JVM start-up time is a lot of the cost of the `tabula` command, so if you're try - writing your own program in any JVM language (Java, JRuby, Scala) that imports tabula-java. - waiting for us to implement an API/server-style system (it's on the [roadmap](https://github.com/tabulapdf/tabula-api)) +## API Usage Examples + +A simple Java code example which extracts all rows and cells from all tables of all pages of a PDF document: + + InputStream in = this.getClass().getResourceAsStream("my.pdf"); + try (PDDocument document = PDDocument.load(in)) { + SpreadsheetExtractionAlgorithm sea = new SpreadsheetExtractionAlgorithm(); + PageIterator pi = new ObjectExtractor(document).extract(); + while (pi.hasNext()) { + // iterate over the pages of the document + Page page = pi.next(); + List table = sea.extract(page); + // iterate over the tables of the page + for(Table tables: table) { + List> rows = tables.getRows(); + // iterate over the rows of the table + for (List cells : rows) { + // print all column-cells of the row plus linefeed + for (RectangularTextContainer content : cells) { + // Note: Cell.getText() uses \r to concat text chunks + String text = content.getText().replace("\r", " 
"); + System.out.print(text + "|"); + } + System.out.println(); + } + } + } + } + +For more detail information check the Javadoc. +The Javadoc API documentation can be generated (see also '_Building from Source_' section) via + +``` +mvn javadoc:javadoc +``` + +which generates the HTML files to directory ```target/site/apidocs/``` + ## Building from Source Clone this repo and run: From 3c2af18f7c3daedafb6a4d33ab5f818cdc468d09 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Manuel=20Aristar=C3=A1n?= Date: Wed, 7 Dec 2022 12:04:34 -0300 Subject: [PATCH 02/30] Fix Markdown formatting for code example --- README.md | 45 ++++++++++++++++++++++++--------------------- 1 file changed, 24 insertions(+), 21 deletions(-) diff --git a/README.md b/README.md index c3a5f92f..db7b0023 100644 --- a/README.md +++ b/README.md @@ -85,30 +85,33 @@ JVM start-up time is a lot of the cost of the `tabula` command, so if you're try A simple Java code example which extracts all rows and cells from all tables of all pages of a PDF document: - InputStream in = this.getClass().getResourceAsStream("my.pdf"); - try (PDDocument document = PDDocument.load(in)) { - SpreadsheetExtractionAlgorithm sea = new SpreadsheetExtractionAlgorithm(); - PageIterator pi = new ObjectExtractor(document).extract(); - while (pi.hasNext()) { - // iterate over the pages of the document - Page page = pi.next(); - List
table = sea.extract(page); - // iterate over the tables of the page - for(Table tables: table) { - List> rows = tables.getRows(); - // iterate over the rows of the table - for (List cells : rows) { - // print all column-cells of the row plus linefeed - for (RectangularTextContainer content : cells) { - // Note: Cell.getText() uses \r to concat text chunks - String text = content.getText().replace("\r", " "); - System.out.print(text + "|"); - } - System.out.println(); - } +```java +InputStream in = this.getClass().getResourceAsStream("my.pdf"); +try (PDDocument document = PDDocument.load(in)) { + SpreadsheetExtractionAlgorithm sea = new SpreadsheetExtractionAlgorithm(); + PageIterator pi = new ObjectExtractor(document).extract(); + while (pi.hasNext()) { + // iterate over the pages of the document + Page page = pi.next(); + List
table = sea.extract(page); + // iterate over the tables of the page + for(Table tables: table) { + List> rows = tables.getRows(); + // iterate over the rows of the table + for (List cells : rows) { + // print all column-cells of the row plus linefeed + for (RectangularTextContainer content : cells) { + // Note: Cell.getText() uses \r to concat text chunks + String text = content.getText().replace("\r", " "); + System.out.print(text + "|"); } + System.out.println(); } } + } +} +``` + For more detail information check the Javadoc. The Javadoc API documentation can be generated (see also '_Building from Source_' section) via From b0fde49e6aa06593d16c8aa0b8da0e3172db1ec2 Mon Sep 17 00:00:00 2001 From: Aki Ariga Date: Mon, 20 Feb 2023 18:17:19 -0800 Subject: [PATCH 03/30] Enforce checkout with LF Windows CI fails when parsing CSV with line breaks within a cell. This is due to the difference of line endings between CRLF vs LF, and test CSV parser implementation. To mitigate this issue, tweak Windows CI to enforce git checkout with LF. 
--- .github/workflows/tests-windows.yml | 11 ++++++++--- .github/workflows/tests.yml | 4 ++-- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/.github/workflows/tests-windows.yml b/.github/workflows/tests-windows.yml index 4ff7f542..5cc1031a 100644 --- a/.github/workflows/tests-windows.yml +++ b/.github/workflows/tests-windows.yml @@ -1,4 +1,4 @@ -name: Java CI +name: Java CI (Windows) on: [push] @@ -7,9 +7,14 @@ jobs: runs-on: windows-latest steps: - - uses: actions/checkout@v2 + # https://github.com/actions/checkout/issues/135#issuecomment-602171132 + - name: Set git to use LF + run: | + git config --global core.autocrlf false + git config --global core.eol lf + - uses: actions/checkout@v3 - name: Set up JDK 11 - uses: actions/setup-java@v2 + uses: actions/setup-java@v3 with: java-version: '11' distribution: 'adopt' diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index b8aa9c14..da2d019b 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -7,9 +7,9 @@ jobs: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Set up JDK 11 - uses: actions/setup-java@v2 + uses: actions/setup-java@v3 with: java-version: '11' distribution: 'adopt' From 8bfa3ad23af34f757f72fe46584a34abfc022ed3 Mon Sep 17 00:00:00 2001 From: Tilman Hausherr Date: Fri, 14 Apr 2023 17:46:46 +0200 Subject: [PATCH 04/30] update pdfbox to 2.0.28 --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index fb1f7e08..27a03e73 100644 --- a/pom.xml +++ b/pom.xml @@ -262,7 +262,7 @@ org.apache.pdfbox pdfbox - 2.0.26 + 2.0.28 From bc60be27d663f40e132a21407ec81790a8aee361 Mon Sep 17 00:00:00 2001 From: young Date: Wed, 6 Mar 2024 14:53:55 +0800 Subject: [PATCH 05/30] update pdfbox to 3.0.1 --- pom.xml | 2 +- .../technology/tabula/CommandLineApp.java | 3 +- .../java/technology/tabula/debug/Debug.java | 5 +- .../detectors/NurminenDetectionAlgorithm.java | 6 +- 
src/test/java/technology/tabula/TestCell.java | 3 +- src/test/java/technology/tabula/TestLine.java | 11 +- .../tabula/TestObjectExtractor.java | 21 +- .../tabula/TestProjectionProfile.java | 8 +- .../technology/tabula/TestTableDetection.java | 3 +- .../technology/tabula/TestTextElement.java | 393 +++++++++--------- .../java/technology/tabula/TestUtils.java | 3 +- .../technology/tabula/UtilsForTesting.java | 7 +- 12 files changed, 238 insertions(+), 227 deletions(-) diff --git a/pom.xml b/pom.xml index 27a03e73..6c71b426 100644 --- a/pom.xml +++ b/pom.xml @@ -262,7 +262,7 @@ org.apache.pdfbox pdfbox - 2.0.28 + 3.0.1 diff --git a/src/main/java/technology/tabula/CommandLineApp.java b/src/main/java/technology/tabula/CommandLineApp.java index 3a6773a9..1b422303 100644 --- a/src/main/java/technology/tabula/CommandLineApp.java +++ b/src/main/java/technology/tabula/CommandLineApp.java @@ -15,6 +15,7 @@ import org.apache.commons.cli.Options; import org.apache.commons.cli.ParseException; import org.apache.commons.cli.DefaultParser; +import org.apache.pdfbox.Loader; import org.apache.pdfbox.pdmodel.PDDocument; import technology.tabula.detectors.DetectionAlgorithm; @@ -158,7 +159,7 @@ public void extractFileInto(File pdfFile, File outputFile) throws ParseException private void extractFile(File pdfFile, Appendable outFile) throws ParseException { PDDocument pdfDocument = null; try { - pdfDocument = this.password == null ? PDDocument.load(pdfFile) : PDDocument.load(pdfFile, this.password); + pdfDocument = this.password == null ? Loader.loadPDF(pdfFile) : Loader.loadPDF(pdfFile,password); PageIterator pageIterator = getPageIterator(pdfDocument); List
tables = new ArrayList<>(); diff --git a/src/main/java/technology/tabula/debug/Debug.java b/src/main/java/technology/tabula/debug/Debug.java index 91609045..d6d257ce 100644 --- a/src/main/java/technology/tabula/debug/Debug.java +++ b/src/main/java/technology/tabula/debug/Debug.java @@ -16,6 +16,7 @@ import java.util.List; import org.apache.commons.cli.*; +import org.apache.pdfbox.Loader; import technology.tabula.Cell; import technology.tabula.CommandLineApp; import technology.tabula.Line; @@ -215,7 +216,7 @@ public static void renderPage(String pdfPath, String outPath, int pageNumber, Re boolean drawColumns, boolean drawCharacters, boolean drawArea, boolean drawCells, boolean drawUnprocessedRulings, boolean drawProjectionProfile, boolean drawClippingPaths, boolean drawDetectedTables) throws IOException { - PDDocument document = PDDocument.load(new File(pdfPath)); + PDDocument document = Loader.loadPDF(new File(pdfPath)); ObjectExtractor oe = new ObjectExtractor(document); @@ -349,7 +350,7 @@ public static void main(String[] args) throws IOException { if (pages == null) { // user specified all pages - PDDocument document = PDDocument.load(pdfFile); + PDDocument document = Loader.loadPDF(pdfFile); int numPages = document.getNumberOfPages(); pages = new ArrayList<>(numPages); diff --git a/src/main/java/technology/tabula/detectors/NurminenDetectionAlgorithm.java b/src/main/java/technology/tabula/detectors/NurminenDetectionAlgorithm.java index fb43622a..9a377854 100644 --- a/src/main/java/technology/tabula/detectors/NurminenDetectionAlgorithm.java +++ b/src/main/java/technology/tabula/detectors/NurminenDetectionAlgorithm.java @@ -805,12 +805,12 @@ private PDDocument removeText(PDPage page) throws IOException { PDFStreamParser parser = new PDFStreamParser(page); parser.parse(); - List tokens = parser.getTokens(); List newTokens = new ArrayList<>(); - for (Object token : tokens) { + while (page.hasContents()) { + Object token = parser.parseNextToken(); if (token 
instanceof Operator) { Operator op = (Operator) token; - if (op.getName().equals("TJ") || op.getName().equals("Tj")) { + if ("TJ".equals(op.getName()) || "Tj".equals(op.getName())) { //remove the one argument to this operator newTokens.remove(newTokens.size() - 1); continue; diff --git a/src/test/java/technology/tabula/TestCell.java b/src/test/java/technology/tabula/TestCell.java index de1b8cb8..2795565c 100644 --- a/src/test/java/technology/tabula/TestCell.java +++ b/src/test/java/technology/tabula/TestCell.java @@ -6,6 +6,7 @@ import java.util.ArrayList; import org.apache.pdfbox.pdmodel.font.PDType1Font; +import org.apache.pdfbox.pdmodel.font.Standard14Fonts; import org.junit.Test; public class TestCell { @@ -31,7 +32,7 @@ public void testGetTextElements() { Cell cell = new Cell(0, 0, 0, 0); assertTrue(cell.getTextElements().isEmpty()); - TextElement tElement = new TextElement(0, 0, 0, 0, PDType1Font.HELVETICA_BOLD, 10, "test", 5); + TextElement tElement = new TextElement(0, 0, 0, 0, new PDType1Font(Standard14Fonts.FontName.HELVETICA_BOLD), 10, "test", 5); TextChunk tChunk = new TextChunk(tElement); List tList = new ArrayList<>(); tList.add(tChunk); diff --git a/src/test/java/technology/tabula/TestLine.java b/src/test/java/technology/tabula/TestLine.java index 90df0e31..f7a6a88d 100644 --- a/src/test/java/technology/tabula/TestLine.java +++ b/src/test/java/technology/tabula/TestLine.java @@ -6,6 +6,7 @@ import java.util.List; import org.apache.pdfbox.pdmodel.font.PDType1Font; +import org.apache.pdfbox.pdmodel.font.Standard14Fonts; import org.junit.Test; public class TestLine { @@ -14,7 +15,7 @@ public class TestLine { public void testSetTextElements() { Line line = new Line(); - TextElement tElement = new TextElement(0, 0, 0, 0, PDType1Font.HELVETICA_BOLD, 10, "test", 5); + TextElement tElement = new TextElement(0, 0, 0, 0, new PDType1Font(Standard14Fonts.FontName.HELVETICA_BOLD), 10, "test", 5); TextChunk tChunk = new TextChunk(tElement); List tList = new 
ArrayList<>(); tList.add(tChunk); @@ -28,7 +29,7 @@ public void testSetTextElements() { public void testAddTextChunkIntTextChunk() { Line line = new Line(); - TextElement tElement = new TextElement(0, 0, 0, 0, PDType1Font.HELVETICA_BOLD, 10, "test", 5); + TextElement tElement = new TextElement(0, 0, 0, 0, new PDType1Font(Standard14Fonts.FontName.HELVETICA_BOLD), 10, "test", 5); TextChunk tChunk = new TextChunk(tElement); line.addTextChunk(3, tChunk); @@ -39,7 +40,7 @@ public void testAddTextChunkIntTextChunk() { public void testLessThanAddTextChunkIntTextChunk() { Line line = new Line(); - TextElement tElement = new TextElement(0, 0, 0, 0, PDType1Font.HELVETICA_BOLD, 10, "test", 5); + TextElement tElement = new TextElement(0, 0, 0, 0, new PDType1Font(Standard14Fonts.FontName.HELVETICA_BOLD), 10, "test", 5); TextChunk tChunk = new TextChunk(tElement); line.addTextChunk(0, tChunk); line.addTextChunk(0, tChunk); @@ -51,7 +52,7 @@ public void testLessThanAddTextChunkIntTextChunk() { public void testErrorAddTextChunkIntTextChunk() { Line line = new Line(); - TextElement tElement = new TextElement(0, 0, 0, 0, PDType1Font.HELVETICA_BOLD, 10, "test", 5); + TextElement tElement = new TextElement(0, 0, 0, 0,new PDType1Font(Standard14Fonts.FontName.HELVETICA_BOLD), 10, "test", 5); TextChunk tChunk = new TextChunk(tElement); line.addTextChunk(-1, tChunk); } @@ -60,7 +61,7 @@ public void testErrorAddTextChunkIntTextChunk() { public void testToString() { Line line = new Line(); - TextElement tElement = new TextElement(0, 0, 0, 0, PDType1Font.HELVETICA_BOLD, 10, "test", 5); + TextElement tElement = new TextElement(0, 0, 0, 0, new PDType1Font(Standard14Fonts.FontName.HELVETICA_BOLD), 10, "test", 5); TextChunk tChunk = new TextChunk(tElement); line.addTextChunk(0, tChunk); line.addTextChunk(0, tChunk); diff --git a/src/test/java/technology/tabula/TestObjectExtractor.java b/src/test/java/technology/tabula/TestObjectExtractor.java index 9db7ad18..69864c61 100644 --- 
a/src/test/java/technology/tabula/TestObjectExtractor.java +++ b/src/test/java/technology/tabula/TestObjectExtractor.java @@ -7,6 +7,7 @@ import java.util.ArrayList; import java.util.List; +import org.apache.pdfbox.Loader; import org.apache.pdfbox.pdmodel.PDDocument; import org.junit.Test; @@ -21,7 +22,7 @@ public void testWrongPasswordRaisesException() throws IOException { @Test(expected = IOException.class) public void testEmptyOnEncryptedFileRaisesException() throws IOException { - PDDocument pdf_document = PDDocument.load(new File("src/test/resources/technology/tabula/encrypted.pdf")); + PDDocument pdf_document = Loader.loadPDF(new File("src/test/resources/technology/tabula/encrypted.pdf")); try (ObjectExtractor oe = new ObjectExtractor(pdf_document)) { oe.extract().next(); } @@ -29,7 +30,7 @@ public void testEmptyOnEncryptedFileRaisesException() throws IOException { @Test public void testCanReadPDFWithOwnerEncryption() throws IOException { - PDDocument pdf_document = PDDocument.load(new File("src/test/resources/technology/tabula/S2MNCEbirdisland.pdf")); + PDDocument pdf_document = Loader.loadPDF(new File("src/test/resources/technology/tabula/S2MNCEbirdisland.pdf")); try (ObjectExtractor oe = new ObjectExtractor(pdf_document)) { PageIterator pi = oe.extract(); int i = 0; @@ -44,7 +45,7 @@ public void testCanReadPDFWithOwnerEncryption() throws IOException { @Test public void testGoodPassword() throws IOException { - PDDocument pdf_document = PDDocument.load(new File("src/test/resources/technology/tabula/encrypted.pdf"), "userpassword"); + PDDocument pdf_document = Loader.loadPDF(new File("src/test/resources/technology/tabula/encrypted.pdf"), "userpassword"); try (ObjectExtractor oe = new ObjectExtractor(pdf_document)) { List pages = new ArrayList<>(); PageIterator pi = oe.extract(); @@ -58,7 +59,7 @@ public void testGoodPassword() throws IOException { @Test public void testTextExtractionDoesNotRaise() throws IOException { - PDDocument pdf_document = 
PDDocument.load(new File("src/test/resources/technology/tabula/rotated_page.pdf")); + PDDocument pdf_document = Loader.loadPDF(new File("src/test/resources/technology/tabula/rotated_page.pdf")); try (ObjectExtractor oe = new ObjectExtractor(pdf_document)) { PageIterator pi = oe.extract(); @@ -70,7 +71,7 @@ public void testTextExtractionDoesNotRaise() throws IOException { @Test public void testShouldDetectRulings() throws IOException { - PDDocument pdf_document = PDDocument.load(new File("src/test/resources/technology/tabula/should_detect_rulings.pdf")); + PDDocument pdf_document = Loader.loadPDF(new File("src/test/resources/technology/tabula/should_detect_rulings.pdf")); try (ObjectExtractor oe = new ObjectExtractor(pdf_document)) { PageIterator pi = oe.extract(); @@ -85,7 +86,7 @@ public void testShouldDetectRulings() throws IOException { @Test public void testDontThrowNPEInShfill() throws IOException { - PDDocument pdf_document = PDDocument.load(new File("src/test/resources/technology/tabula/labor.pdf")); + PDDocument pdf_document = Loader.loadPDF(new File("src/test/resources/technology/tabula/labor.pdf")); try (ObjectExtractor oe = new ObjectExtractor(pdf_document)) { PageIterator pi = oe.extract(); @@ -101,7 +102,7 @@ public void testDontThrowNPEInShfill() throws IOException { @Test public void testExtractOnePage() throws IOException { - PDDocument pdf_document = PDDocument.load(new File("src/test/resources/technology/tabula/S2MNCEbirdisland.pdf")); + PDDocument pdf_document = Loader.loadPDF(new File("src/test/resources/technology/tabula/S2MNCEbirdisland.pdf")); assertEquals(2, pdf_document.getNumberOfPages()); try (ObjectExtractor oe = new ObjectExtractor(pdf_document)) { @@ -114,7 +115,7 @@ public void testExtractOnePage() throws IOException { @Test(expected = IndexOutOfBoundsException.class) public void testExtractWrongPageNumber() throws IOException { - PDDocument pdf_document = PDDocument.load(new 
File("src/test/resources/technology/tabula/S2MNCEbirdisland.pdf")); + PDDocument pdf_document = Loader.loadPDF(new File("src/test/resources/technology/tabula/S2MNCEbirdisland.pdf")); assertEquals(2, pdf_document.getNumberOfPages()); try (ObjectExtractor oe = new ObjectExtractor(pdf_document)) { @@ -124,7 +125,7 @@ public void testExtractWrongPageNumber() throws IOException { @Test public void testTextElementsContainedInPage() throws IOException { - PDDocument pdf_document = PDDocument.load(new File("src/test/resources/technology/tabula/cs-en-us-pbms.pdf")); + PDDocument pdf_document = Loader.loadPDF(new File("src/test/resources/technology/tabula/cs-en-us-pbms.pdf")); try (ObjectExtractor oe = new ObjectExtractor(pdf_document)) { Page page = oe.extractPage(1); @@ -137,7 +138,7 @@ public void testTextElementsContainedInPage() throws IOException { } @Test public void testDoNotNPEInPointComparator() throws IOException { - PDDocument pdf_document = PDDocument.load(new File("src/test/resources/technology/tabula/npe_issue_206.pdf")); + PDDocument pdf_document = Loader.loadPDF(new File("src/test/resources/technology/tabula/npe_issue_206.pdf")); try (ObjectExtractor oe = new ObjectExtractor(pdf_document)) { Page p = oe.extractPage(1); diff --git a/src/test/java/technology/tabula/TestProjectionProfile.java b/src/test/java/technology/tabula/TestProjectionProfile.java index e7af882f..e6d93b39 100644 --- a/src/test/java/technology/tabula/TestProjectionProfile.java +++ b/src/test/java/technology/tabula/TestProjectionProfile.java @@ -8,6 +8,7 @@ import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.font.PDType1Font; +import org.apache.pdfbox.pdmodel.font.Standard14Fonts; import org.junit.Before; import org.junit.Test; @@ -20,9 +21,10 @@ public class TestProjectionProfile { public void setUpProjectionProfile() { PDPage pdPage = new PDPage(); PDDocument pdDocument = new PDDocument(); - - TextElement textElement = new 
TextElement(5f, 15f, 10f, 20f, PDType1Font.HELVETICA, 1f, "test", 1f); - TextElement textElement2 = new TextElement(5f, 15f, 10f, 20f, PDType1Font.HELVETICA, 1f, "test", 1f); + + PDType1Font font = new PDType1Font(Standard14Fonts.FontName.HELVETICA); + TextElement textElement = new TextElement(5f, 15f, 10f, 20f, font, 1f, "test", 1f); + TextElement textElement2 = new TextElement(5f, 15f, 10f, 20f, font, 1f, "test", 1f); List textList = new ArrayList<>(); textList.add(textElement); textList.add(textElement2); diff --git a/src/test/java/technology/tabula/TestTableDetection.java b/src/test/java/technology/tabula/TestTableDetection.java index 6e58f6a4..80d21350 100644 --- a/src/test/java/technology/tabula/TestTableDetection.java +++ b/src/test/java/technology/tabula/TestTableDetection.java @@ -11,6 +11,7 @@ import static org.junit.Assert.*; import com.google.gson.Gson; +import org.apache.pdfbox.Loader; import org.junit.AfterClass; import org.junit.BeforeClass; import org.junit.Test; @@ -162,7 +163,7 @@ public void testDetectionOfTables() throws Exception { NodeList tables = regionDocument.getElementsByTagName("table"); // tabula extractors - PDDocument pdfDocument = PDDocument.load(this.pdf); + PDDocument pdfDocument = Loader.loadPDF(this.pdf); ObjectExtractor extractor = new ObjectExtractor(pdfDocument); // parse expected tables from the ground truth dataset diff --git a/src/test/java/technology/tabula/TestTextElement.java b/src/test/java/technology/tabula/TestTextElement.java index feaaa5e6..3db1ca31 100644 --- a/src/test/java/technology/tabula/TestTextElement.java +++ b/src/test/java/technology/tabula/TestTextElement.java @@ -3,205 +3,208 @@ import java.util.ArrayList; import java.util.List; +import org.apache.pdfbox.pdmodel.font.PDFont; import org.apache.pdfbox.pdmodel.font.PDType1Font; +import org.apache.pdfbox.pdmodel.font.Standard14Fonts; import org.junit.Assert; import org.junit.Test; public class TestTextElement { - - - @Test - public void createTextElement() 
{ - - TextElement textElement = new TextElement(5f, 15f, 10f, 20f, PDType1Font.HELVETICA, 1f, "A", 1f); - - Assert.assertNotNull(textElement); - Assert.assertEquals("A", textElement.getText()); - Assert.assertEquals(1f, textElement.getFontSize(), 0); - Assert.assertEquals(15f, textElement.getLeft(), 0); - Assert.assertEquals(5f, textElement.getTop(), 0); - Assert.assertEquals(10f, textElement.getWidth(), 0); - Assert.assertEquals(20f, textElement.getHeight(), 0); - Assert.assertEquals(PDType1Font.HELVETICA, textElement.getFont()); - Assert.assertEquals(1f, textElement.getWidthOfSpace(), 0); - Assert.assertEquals(0f, textElement.getDirection(), 0); - - - } - - @Test - public void createTextElementWithDirection() { - - TextElement textElement = new TextElement(5f, 15f, 10f, 20f, PDType1Font.HELVETICA, 1f, "A", 1f, 6f); - - Assert.assertNotNull(textElement); - Assert.assertEquals("A", textElement.getText()); - Assert.assertEquals(1f, textElement.getFontSize(), 0); - Assert.assertEquals(15f, textElement.getLeft(), 0); - Assert.assertEquals(5f, textElement.getTop(), 0); - Assert.assertEquals(10f, textElement.getWidth(), 0); - Assert.assertEquals(20f, textElement.getHeight(), 0); - Assert.assertEquals(PDType1Font.HELVETICA, textElement.getFont()); - Assert.assertEquals(1f, textElement.getWidthOfSpace(), 0); - Assert.assertEquals(6f, textElement.getDirection(), 0); - - - } - - @Test - public void mergeFourElementsIntoFourWords() { - - List elements = new ArrayList<>(); - elements.add(new TextElement(0f, 15f, 10f, 20f, PDType1Font.HELVETICA, 1f, "A", 1f, 6f)); - elements.add(new TextElement(20f, 15f, 10f, 20f, PDType1Font.HELVETICA, 1f, "B", 1f, 6f)); - elements.add(new TextElement(40f, 15f, 10f, 20f, PDType1Font.HELVETICA, 1f, "C", 1f, 6f)); - elements.add(new TextElement(60f, 15f, 10f, 20f, PDType1Font.HELVETICA, 1f, "D", 1f, 6f)); - - List words = TextElement.mergeWords(elements); - - List expectedWords = new ArrayList<>(); - expectedWords.add(new TextChunk(new 
TextElement(0f, 15f, 10f, 20f, PDType1Font.HELVETICA, 1f, "A", 1f, 6f))); - expectedWords.add(new TextChunk(new TextElement(20f, 15f, 10f, 20f, PDType1Font.HELVETICA, 1f, "B", 1f, 6f))); - expectedWords.add(new TextChunk(new TextElement(40f, 15f, 10f, 20f, PDType1Font.HELVETICA, 1f, "C", 1f, 6f))); - expectedWords.add(new TextChunk(new TextElement(60f, 15f, 10f, 20f, PDType1Font.HELVETICA, 1f, "D", 1f, 6f))); - - Assert.assertEquals(expectedWords, words); - - } - - @Test - public void mergeFourElementsIntoOneWord() { - - List elements = new ArrayList<>(); - elements.add(new TextElement(0f, 15f, 10f, 20f, PDType1Font.HELVETICA, 1f, "A", 1f, 6f)); - elements.add(new TextElement(0f, 25f, 10f, 20f, PDType1Font.HELVETICA, 1f, "B", 1f, 6f)); - elements.add(new TextElement(0f, 35f, 10f, 20f, PDType1Font.HELVETICA, 1f, "C", 1f, 6f)); - elements.add(new TextElement(0f, 45f, 10f, 20f, PDType1Font.HELVETICA, 1f, "D", 1f, 6f)); - - List words = TextElement.mergeWords(elements); - - List expectedWords = new ArrayList<>(); - TextChunk textChunk = new TextChunk(new TextElement(0f, 15f, 10f, 20f, PDType1Font.HELVETICA, 1f, "A", 1f, 6f)); - textChunk.add(new TextElement(0f, 25f, 10f, 20f, PDType1Font.HELVETICA, 1f, "B", 1f, 6f)); - textChunk.add(new TextElement(0f, 35f, 10f, 20f, PDType1Font.HELVETICA, 1f, "C", 1f, 6f)); - textChunk.add(new TextElement(0f, 45f, 10f, 20f, PDType1Font.HELVETICA, 1f, "D", 1f, 6f)); - expectedWords.add(textChunk); - - Assert.assertEquals(expectedWords, words); - - } - - @Test - public void mergeElementsShouldBeIdempotent() { - /* - * a bug in TextElement.merge_words would delete the first TextElement in the array - * it was called with. 
Discussion here: https://github.com/tabulapdf/tabula-java/issues/78 - */ - - List elements = new ArrayList<>(); - elements.add(new TextElement(0f, 15f, 10f, 20f, PDType1Font.HELVETICA, 1f, "A", 1f, 6f)); - elements.add(new TextElement(0f, 25f, 10f, 20f, PDType1Font.HELVETICA, 1f, "B", 1f, 6f)); - elements.add(new TextElement(0f, 35f, 10f, 20f, PDType1Font.HELVETICA, 1f, "C", 1f, 6f)); - elements.add(new TextElement(0f, 45f, 10f, 20f, PDType1Font.HELVETICA, 1f, "D", 1f, 6f)); - - List words = TextElement.mergeWords(elements); - List words2 = TextElement.mergeWords(elements); - Assert.assertEquals(words, words2); - } - - @Test - public void mergeElementsWithSkippingRules() { - - List elements = new ArrayList<>(); - elements.add(new TextElement(0f, 15f, 10f, 20f, PDType1Font.HELVETICA, 1f, "A", 1f, 6f)); - elements.add(new TextElement(0f, 17f, 10f, 20f, PDType1Font.HELVETICA, 1f, "A", 1f, 6f)); - elements.add(new TextElement(0f, 25f, 10f, 20f, PDType1Font.HELVETICA, 1f, "B", 1f, 6f)); - elements.add(new TextElement(0.001f, 25f, 10f, 20f, PDType1Font.HELVETICA, 1f, " ", 1f, 6f)); - elements.add(new TextElement(0f, 35f, 10f, 20f, PDType1Font.HELVETICA, 1f, "C", 1f, 6f)); - elements.add(new TextElement(0f, 45f, 10f, 20f, PDType1Font.TIMES_ROMAN, 10f, "D", 1f, 6f)); - - List words = TextElement.mergeWords(elements); - - List expectedWords = new ArrayList<>(); - TextChunk textChunk = new TextChunk(new TextElement(0f, 15f, 10f, 20f, PDType1Font.HELVETICA, 1f, "A", 1f, 6f)); - textChunk.add(new TextElement(0f, 25f, 10f, 20f, PDType1Font.HELVETICA, 1f, "B", 1f, 6f)); - textChunk.add(new TextElement(0f, 35f, 10f, 20f, PDType1Font.HELVETICA, 1f, "C", 1f, 6f)); - textChunk.add(new TextElement(0f, 45f, 10f, 20f, PDType1Font.TIMES_ROMAN, 10f, "D", 1f, 6f)); - expectedWords.add(textChunk); - - Assert.assertEquals(expectedWords, words); - - } - - @Test - public void mergeTenElementsIntoTwoWords() { - - List elements = new ArrayList<>(); - elements.add(new TextElement(0f, 0f, 10f, 
20f, PDType1Font.HELVETICA, 1f, "H", 1f, 6f)); - elements.add(new TextElement(0f, 10f, 10f, 20f, PDType1Font.HELVETICA, 1f, "O", 1f, 6f)); - elements.add(new TextElement(0f, 20f, 10f, 20f, PDType1Font.HELVETICA, 1f, "L", 1f, 6f)); - elements.add(new TextElement(0f, 30f, 10f, 20f, PDType1Font.HELVETICA, 1f, "A", 1f, 6f)); - elements.add(new TextElement(0f, 60f, 10f, 20f, PDType1Font.HELVETICA, 1f, "M", 1f, 6f)); - elements.add(new TextElement(0f, 70f, 10f, 20f, PDType1Font.HELVETICA, 1f, "U", 1f, 6f)); - elements.add(new TextElement(0f, 80f, 10f, 20f, PDType1Font.HELVETICA, 1f, "N", 1f, 6f)); - elements.add(new TextElement(0f, 90f, 10f, 20f, PDType1Font.HELVETICA, 1f, "D", 1f, 6f)); - elements.add(new TextElement(0f, 100f, 10f, 20f, PDType1Font.HELVETICA, 1f, "O", 1f, 6f)); - - List words = TextElement.mergeWords(elements); - - List expectedWords = new ArrayList<>(); - TextChunk textChunk = new TextChunk(new TextElement(0f, 0f, 10f, 20f, PDType1Font.HELVETICA, 1f, "H", 1f, 6f)); - textChunk.add(new TextElement(0f, 10f, 10f, 20f, PDType1Font.HELVETICA, 1f, "O", 1f, 6f)); - textChunk.add(new TextElement(0f, 20f, 10f, 20f, PDType1Font.HELVETICA, 1f, "L", 1f, 6f)); - textChunk.add(new TextElement(0f, 30f, 10f, 20f, PDType1Font.HELVETICA, 1f, "A", 1f, 6f)); - textChunk.add(new TextElement(0f, 30f, 10.5f, 20f, PDType1Font.HELVETICA, 1f, " ", 1f)); //Check why width=10.5? 
- expectedWords.add(textChunk); - TextChunk textChunk2 = new TextChunk(new TextElement(0f, 60f, 10f, 20f, PDType1Font.HELVETICA, 1f, "M", 1f, 6f)); - textChunk2.add(new TextElement(0f, 70f, 10f, 20f, PDType1Font.HELVETICA, 1f, "U", 1f, 6f)); - textChunk2.add(new TextElement(0f, 80f, 10f, 20f, PDType1Font.HELVETICA, 1f, "N", 1f, 6f)); - textChunk2.add(new TextElement(0f, 90f, 10f, 20f, PDType1Font.HELVETICA, 1f, "D", 1f, 6f)); - textChunk2.add(new TextElement(0f, 100f, 10f, 20f, PDType1Font.HELVETICA, 1f, "O", 1f, 6f)); - expectedWords.add(textChunk2); - - Assert.assertEquals(2, words.size()); - Assert.assertEquals(expectedWords, words); - - } - - @Test - public void mergeTenElementsIntoTwoLines() { - - List elements = new ArrayList<>(); - elements.add(new TextElement(0f, 0f, 10f, 20f, PDType1Font.HELVETICA, 1f, "H", 1f, 6f)); - elements.add(new TextElement(0f, 10f, 10f, 20f, PDType1Font.HELVETICA, 1f, "O", 1f, 6f)); - elements.add(new TextElement(0f, 20f, 10f, 20f, PDType1Font.HELVETICA, 1f, "L", 1f, 6f)); - elements.add(new TextElement(0f, 30f, 10f, 20f, PDType1Font.HELVETICA, 1f, "A", 1f, 6f)); - elements.add(new TextElement(20f, 0f, 10f, 20f, PDType1Font.HELVETICA, 1f, "M", 1f, 6f)); - elements.add(new TextElement(20f, 10f, 10f, 20f, PDType1Font.HELVETICA, 1f, "U", 1f, 6f)); - elements.add(new TextElement(20f, 20f, 10f, 20f, PDType1Font.HELVETICA, 1f, "N", 1f, 6f)); - elements.add(new TextElement(20f, 30f, 10f, 20f, PDType1Font.HELVETICA, 1f, "D", 1f, 6f)); - elements.add(new TextElement(20f, 40f, 10f, 20f, PDType1Font.HELVETICA, 1f, "O", 1f, 6f)); - - List words = TextElement.mergeWords(elements); - - List expectedWords = new ArrayList<>(); - TextChunk textChunk = new TextChunk(new TextElement(0f, 0f, 10f, 20f, PDType1Font.HELVETICA, 1f, "H", 1f, 6f)); - textChunk.add(new TextElement(0f, 10f, 10f, 20f, PDType1Font.HELVETICA, 1f, "O", 1f, 6f)); - textChunk.add(new TextElement(0f, 20f, 10f, 20f, PDType1Font.HELVETICA, 1f, "L", 1f, 6f)); - textChunk.add(new 
TextElement(0f, 30f, 10f, 20f, PDType1Font.HELVETICA, 1f, "A", 1f, 6f)); - expectedWords.add(textChunk); - TextChunk textChunk2 = new TextChunk(new TextElement(20f, 0f, 10f, 20f, PDType1Font.HELVETICA, 1f, "M", 1f, 6f)); - textChunk2.add(new TextElement(20f, 10f, 10f, 20f, PDType1Font.HELVETICA, 1f, "U", 1f, 6f)); - textChunk2.add(new TextElement(20f, 20f, 10f, 20f, PDType1Font.HELVETICA, 1f, "N", 1f, 6f)); - textChunk2.add(new TextElement(20f, 30f, 10f, 20f, PDType1Font.HELVETICA, 1f, "D", 1f, 6f)); - textChunk2.add(new TextElement(20f, 40f, 10f, 20f, PDType1Font.HELVETICA, 1f, "O", 1f, 6f)); - expectedWords.add(textChunk2); - - Assert.assertEquals(2, words.size()); - Assert.assertEquals(expectedWords, words); - - } - - + + + @Test + public void createTextElement() { + + TextElement textElement = new TextElement(5f, 15f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "A", 1f); + + Assert.assertNotNull(textElement); + Assert.assertEquals("A", textElement.getText()); + Assert.assertEquals(1f, textElement.getFontSize(), 0); + Assert.assertEquals(15f, textElement.getLeft(), 0); + Assert.assertEquals(5f, textElement.getTop(), 0); + Assert.assertEquals(10f, textElement.getWidth(), 0); + Assert.assertEquals(20f, textElement.getHeight(), 0); + Assert.assertEquals(new PDType1Font(Standard14Fonts.FontName.HELVETICA), textElement.getFont()); + Assert.assertEquals(1f, textElement.getWidthOfSpace(), 0); + Assert.assertEquals(0f, textElement.getDirection(), 0); + + + } + + @Test + public void createTextElementWithDirection() { + + TextElement textElement = new TextElement(5f, 15f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "A", 1f, 6f); + + Assert.assertNotNull(textElement); + Assert.assertEquals("A", textElement.getText()); + Assert.assertEquals(1f, textElement.getFontSize(), 0); + Assert.assertEquals(15f, textElement.getLeft(), 0); + Assert.assertEquals(5f, textElement.getTop(), 0); + Assert.assertEquals(10f, textElement.getWidth(), 
0); + Assert.assertEquals(20f, textElement.getHeight(), 0); + Assert.assertEquals(new PDType1Font(Standard14Fonts.FontName.HELVETICA), textElement.getFont()); + Assert.assertEquals(1f, textElement.getWidthOfSpace(), 0); + Assert.assertEquals(6f, textElement.getDirection(), 0); + + + } + + @Test + public void mergeFourElementsIntoFourWords() { + + List elements = new ArrayList<>(); + elements.add(new TextElement(0f, 15f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "A", 1f, 6f)); + elements.add(new TextElement(20f, 15f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "B", 1f, 6f)); + elements.add(new TextElement(40f, 15f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "C", 1f, 6f)); + elements.add(new TextElement(60f, 15f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "D", 1f, 6f)); + + List words = TextElement.mergeWords(elements); + + List expectedWords = new ArrayList<>(); + expectedWords.add(new TextChunk(new TextElement(0f, 15f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "A", 1f, 6f))); + expectedWords.add(new TextChunk(new TextElement(20f, 15f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "B", 1f, 6f))); + expectedWords.add(new TextChunk(new TextElement(40f, 15f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "C", 1f, 6f))); + expectedWords.add(new TextChunk(new TextElement(60f, 15f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "D", 1f, 6f))); + + Assert.assertEquals(expectedWords, words); + + } + + @Test + public void mergeFourElementsIntoOneWord() { + + List elements = new ArrayList<>(); + elements.add(new TextElement(0f, 15f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "A", 1f, 6f)); + elements.add(new TextElement(0f, 25f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "B", 1f, 6f)); + elements.add(new TextElement(0f, 35f, 10f, 20f, new 
PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "C", 1f, 6f)); + elements.add(new TextElement(0f, 45f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "D", 1f, 6f)); + + List words = TextElement.mergeWords(elements); + + List expectedWords = new ArrayList<>(); + TextChunk textChunk = new TextChunk(new TextElement(0f, 15f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "A", 1f, 6f)); + textChunk.add(new TextElement(0f, 25f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "B", 1f, 6f)); + textChunk.add(new TextElement(0f, 35f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "C", 1f, 6f)); + textChunk.add(new TextElement(0f, 45f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "D", 1f, 6f)); + expectedWords.add(textChunk); + + Assert.assertEquals(expectedWords, words); + + } + + @Test + public void mergeElementsShouldBeIdempotent() { + /* + * a bug in TextElement.merge_words would delete the first TextElement in the array + * it was called with. 
Discussion here: https://github.com/tabulapdf/tabula-java/issues/78 + */ + + List elements = new ArrayList<>(); + elements.add(new TextElement(0f, 15f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "A", 1f, 6f)); + elements.add(new TextElement(0f, 25f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "B", 1f, 6f)); + elements.add(new TextElement(0f, 35f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "C", 1f, 6f)); + elements.add(new TextElement(0f, 45f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "D", 1f, 6f)); + + List words = TextElement.mergeWords(elements); + List words2 = TextElement.mergeWords(elements); + Assert.assertEquals(words, words2); + } + + @Test + public void mergeElementsWithSkippingRules() { + + List elements = new ArrayList<>(); + elements.add(new TextElement(0f, 15f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "A", 1f, 6f)); + elements.add(new TextElement(0f, 17f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "A", 1f, 6f)); + elements.add(new TextElement(0f, 25f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "B", 1f, 6f)); + elements.add(new TextElement(0.001f, 25f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, " ", 1f, 6f)); + elements.add(new TextElement(0f, 35f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "C", 1f, 6f)); + PDFont TIMES_ROMAN = new PDType1Font(Standard14Fonts.FontName.TIMES_ROMAN); + elements.add(new TextElement(0f, 45f, 10f, 20f, TIMES_ROMAN, 10f, "D", 1f, 6f)); + + List words = TextElement.mergeWords(elements); + + List expectedWords = new ArrayList<>(); + TextChunk textChunk = new TextChunk(new TextElement(0f, 15f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "A", 1f, 6f)); + textChunk.add(new TextElement(0f, 25f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "B", 1f, 6f)); + textChunk.add(new 
TextElement(0f, 35f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "C", 1f, 6f)); + textChunk.add(new TextElement(0f, 45f, 10f, 20f, TIMES_ROMAN, 10f, "D", 1f, 6f)); + expectedWords.add(textChunk); + + Assert.assertEquals(expectedWords, words); + + } + + @Test + public void mergeTenElementsIntoTwoWords() { + + List elements = new ArrayList<>(); + elements.add(new TextElement(0f, 0f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "H", 1f, 6f)); + elements.add(new TextElement(0f, 10f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "O", 1f, 6f)); + elements.add(new TextElement(0f, 20f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "L", 1f, 6f)); + elements.add(new TextElement(0f, 30f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "A", 1f, 6f)); + elements.add(new TextElement(0f, 60f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "M", 1f, 6f)); + elements.add(new TextElement(0f, 70f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "U", 1f, 6f)); + elements.add(new TextElement(0f, 80f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "N", 1f, 6f)); + elements.add(new TextElement(0f, 90f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "D", 1f, 6f)); + elements.add(new TextElement(0f, 100f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "O", 1f, 6f)); + + List words = TextElement.mergeWords(elements); + + List expectedWords = new ArrayList<>(); + TextChunk textChunk = new TextChunk(new TextElement(0f, 0f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "H", 1f, 6f)); + textChunk.add(new TextElement(0f, 10f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "O", 1f, 6f)); + textChunk.add(new TextElement(0f, 20f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "L", 1f, 6f)); + textChunk.add(new TextElement(0f, 30f, 10f, 
20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "A", 1f, 6f)); + textChunk.add(new TextElement(0f, 30f, 10.5f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, " ", 1f)); //Check why width=10.5? + expectedWords.add(textChunk); + TextChunk textChunk2 = new TextChunk(new TextElement(0f, 60f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "M", 1f, 6f)); + textChunk2.add(new TextElement(0f, 70f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "U", 1f, 6f)); + textChunk2.add(new TextElement(0f, 80f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "N", 1f, 6f)); + textChunk2.add(new TextElement(0f, 90f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "D", 1f, 6f)); + textChunk2.add(new TextElement(0f, 100f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "O", 1f, 6f)); + expectedWords.add(textChunk2); + + Assert.assertEquals(2, words.size()); + Assert.assertEquals(expectedWords, words); + + } + + @Test + public void mergeTenElementsIntoTwoLines() { + + List elements = new ArrayList<>(); + PDType1Font font = new PDType1Font(Standard14Fonts.FontName.HELVETICA); + elements.add(new TextElement(0f, 0f, 10f, 20f, font, 1f, "H", 1f, 6f)); + elements.add(new TextElement(0f, 10f, 10f, 20f, font, 1f, "O", 1f, 6f)); + elements.add(new TextElement(0f, 20f, 10f, 20f, font, 1f, "L", 1f, 6f)); + elements.add(new TextElement(0f, 30f, 10f, 20f, font, 1f, "A", 1f, 6f)); + elements.add(new TextElement(20f, 0f, 10f, 20f, font, 1f, "M", 1f, 6f)); + elements.add(new TextElement(20f, 10f, 10f, 20f, font, 1f, "U", 1f, 6f)); + elements.add(new TextElement(20f, 20f, 10f, 20f, font, 1f, "N", 1f, 6f)); + elements.add(new TextElement(20f, 30f, 10f, 20f, font, 1f, "D", 1f, 6f)); + elements.add(new TextElement(20f, 40f, 10f, 20f, font, 1f, "O", 1f, 6f)); + + List words = TextElement.mergeWords(elements); + + List expectedWords = new ArrayList<>(); + TextChunk textChunk = new 
TextChunk(new TextElement(0f, 0f, 10f, 20f, font, 1f, "H", 1f, 6f)); + textChunk.add(new TextElement(0f, 10f, 10f, 20f, font, 1f, "O", 1f, 6f)); + textChunk.add(new TextElement(0f, 20f, 10f, 20f, font, 1f, "L", 1f, 6f)); + textChunk.add(new TextElement(0f, 30f, 10f, 20f, font, 1f, "A", 1f, 6f)); + expectedWords.add(textChunk); + TextChunk textChunk2 = new TextChunk(new TextElement(20f, 0f, 10f, 20f, font, 1f, "M", 1f, 6f)); + textChunk2.add(new TextElement(20f, 10f, 10f, 20f, font, 1f, "U", 1f, 6f)); + textChunk2.add(new TextElement(20f, 20f, 10f, 20f, font, 1f, "N", 1f, 6f)); + textChunk2.add(new TextElement(20f, 30f, 10f, 20f, font, 1f, "D", 1f, 6f)); + textChunk2.add(new TextElement(20f, 40f, 10f, 20f, font, 1f, "O", 1f, 6f)); + expectedWords.add(textChunk2); + + Assert.assertEquals(2, words.size()); + Assert.assertEquals(expectedWords, words); + + } + } diff --git a/src/test/java/technology/tabula/TestUtils.java b/src/test/java/technology/tabula/TestUtils.java index e68411df..cb85cb7b 100644 --- a/src/test/java/technology/tabula/TestUtils.java +++ b/src/test/java/technology/tabula/TestUtils.java @@ -12,6 +12,7 @@ import java.util.Collections; import java.util.List; +import org.apache.pdfbox.Loader; import org.apache.pdfbox.rendering.ImageType; import org.apache.commons.cli.ParseException; import org.apache.pdfbox.pdmodel.PDDocument; @@ -122,7 +123,7 @@ public void testQuickSortLongList() { @Test public void testJPEG2000DoesNotRaise() throws IOException { - PDDocument pdf_document = PDDocument.load(new File("src/test/resources/technology/tabula/jpeg2000.pdf")); + PDDocument pdf_document = Loader.loadPDF(new File("src/test/resources/technology/tabula/jpeg2000.pdf")); PDPage page = pdf_document.getPage(0); Utils.pageConvertToImage(pdf_document, page, 360, ImageType.RGB); } diff --git a/src/test/java/technology/tabula/UtilsForTesting.java b/src/test/java/technology/tabula/UtilsForTesting.java index 3ee8efde..8d3c91cf 100644 --- 
a/src/test/java/technology/tabula/UtilsForTesting.java +++ b/src/test/java/technology/tabula/UtilsForTesting.java @@ -7,6 +7,7 @@ import org.apache.commons.csv.CSVFormat; import org.apache.commons.csv.CSVParser; import org.apache.commons.csv.CSVPrinter; +import org.apache.pdfbox.Loader; import org.apache.pdfbox.pdmodel.PDDocument; import org.junit.Assert; @@ -23,11 +24,9 @@ public static Page getAreaFromPage(String path, int page, float top, float left, public static Page getPage(String path, int pageNumber) throws IOException { ObjectExtractor oe = null; try { - PDDocument document = PDDocument - .load(new File(path)); + PDDocument document = Loader.loadPDF(new File(path)); oe = new ObjectExtractor(document); - Page page = oe.extract(pageNumber); - return page; + return oe.extract(pageNumber); } finally { if (oe != null) oe.close(); From d0241fb5ff9182d7980c3ccd572cc8bb2dba9357 Mon Sep 17 00:00:00 2001 From: young Date: Wed, 6 Mar 2024 14:59:35 +0800 Subject: [PATCH 06/30] remove useless variable --- .../tabula/detectors/SpreadsheetDetectionAlgorithm.java | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/main/java/technology/tabula/detectors/SpreadsheetDetectionAlgorithm.java b/src/main/java/technology/tabula/detectors/SpreadsheetDetectionAlgorithm.java index 243cc3bf..43136ba5 100644 --- a/src/main/java/technology/tabula/detectors/SpreadsheetDetectionAlgorithm.java +++ b/src/main/java/technology/tabula/detectors/SpreadsheetDetectionAlgorithm.java @@ -20,8 +20,6 @@ public class SpreadsheetDetectionAlgorithm implements DetectionAlgorithm { public List detect(Page page) { List cells = SpreadsheetExtractionAlgorithm.findCells(page.getHorizontalRulings(), page.getVerticalRulings()); - SpreadsheetExtractionAlgorithm sea = new SpreadsheetExtractionAlgorithm(); - List tables = SpreadsheetExtractionAlgorithm.findSpreadsheetsFromCells(cells); // we want tables to be returned from top to bottom on the page From 63de16a4e102b44ea370919625221561dc783e75 Mon Sep 17 
00:00:00 2001 From: young Date: Thu, 11 Apr 2024 14:09:16 +0800 Subject: [PATCH 07/30] exclude junit-jupiter from pdfbox --- pom.xml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pom.xml b/pom.xml index 6c71b426..f60528d0 100644 --- a/pom.xml +++ b/pom.xml @@ -263,6 +263,12 @@ org.apache.pdfbox pdfbox 3.0.1 + + + org.junit.jupiter + junit-jupiter + + From e0ee0728ca398023ab67f59626a55525de0355b0 Mon Sep 17 00:00:00 2001 From: young Date: Thu, 11 Apr 2024 14:31:10 +0800 Subject: [PATCH 08/30] update pdfbox to 3.0.2 --- pom.xml | 341 ++++++++++++++++++++++++++++---------------------------- 1 file changed, 168 insertions(+), 173 deletions(-) diff --git a/pom.xml b/pom.xml index f60528d0..52943fbf 100644 --- a/pom.xml +++ b/pom.xml @@ -1,4 +1,5 @@ - + 4.0.0 technology.tabula tabula @@ -33,16 +34,16 @@ - - snapshots - https://repository.apache.org/content/repositories/snapshots/ - - false - - - true - - + + snapshots + https://repository.apache.org/content/repositories/snapshots/ + + false + + + true + + @@ -109,20 +110,20 @@ - org.apache.maven.plugins - maven-javadoc-plugin - 3.3.1 - - 8 - - - - attach-javadocs - - jar - - - + org.apache.maven.plugins + maven-javadoc-plugin + 3.3.1 + + 8 + + + + attach-javadocs + + jar + + + org.apache.maven.plugins @@ -159,172 +160,166 @@ technology.tabula.CommandLineApp - - - jar-with-dependencies - + + + jar-with-dependencies + - - - org.apache.maven.plugins - maven-surefire-plugin - 2.22.2 - - - -Xms1024m -Xmx2048m - - - - - org.apache.maven.plugins - maven-eclipse-plugin - 2.10 - - true - true - - - - - - - - release - - + org.apache.maven.plugins - maven-javadoc-plugin - 3.3.1 + maven-surefire-plugin + 2.22.2 - 8 + + -Xms1024m -Xmx2048m - - - attach-javadocs - - jar - - - - - org.apache.maven.plugins - maven-source-plugin - 3.2.1 - - - attach-sources - - jar-no-fork - - - - - - org.apache.maven.plugins - maven-gpg-plugin - 1.6 - - - sign-artifacts - verify - - sign - - - - - - - - + + + org.apache.maven.plugins + 
maven-eclipse-plugin + 2.10 + + true + true + + + + + + + + release + + + + org.apache.maven.plugins + maven-javadoc-plugin + 3.3.1 + + 8 + + + + attach-javadocs + + jar + + + + + + org.apache.maven.plugins + maven-source-plugin + 3.2.1 + + + attach-sources + + jar-no-fork + + + + + + org.apache.maven.plugins + maven-gpg-plugin + 1.6 + + + sign-artifacts + verify + + sign + + + + + + + + - - - org.locationtech.jts - jts-core - 1.18.1 - + + + org.locationtech.jts + jts-core + 1.18.1 + - - org.slf4j - slf4j-api - 1.7.35 - + + org.slf4j + slf4j-api + 1.7.35 + - - org.slf4j - slf4j-simple - 1.7.32 - + + org.slf4j + slf4j-simple + 1.7.32 + - - org.apache.pdfbox - pdfbox - 3.0.1 - - - org.junit.jupiter - junit-jupiter - - - + + org.apache.pdfbox + pdfbox + 3.0.2 + - - org.bouncycastle - bcprov-jdk15on - 1.70 - + + org.bouncycastle + bcprov-jdk15on + 1.70 + - - org.bouncycastle - bcmail-jdk15on - 1.70 - + + org.bouncycastle + bcmail-jdk15on + 1.70 + - - junit - junit - 4.13.2 - test - + + junit + junit + 4.13.2 + test + - - commons-cli - commons-cli - 1.4 - + + commons-cli + commons-cli + 1.4 + - - org.apache.commons - commons-csv - 1.9.0 - + + org.apache.commons + commons-csv + 1.9.0 + - - com.google.code.gson - gson - 2.9.0 - + + com.google.code.gson + gson + 2.9.0 + - - com.github.jai-imageio - jai-imageio-core - 1.4.0 - + + com.github.jai-imageio + jai-imageio-core + 1.4.0 + - - com.github.jai-imageio - jai-imageio-jpeg2000 - 1.4.0 - + + com.github.jai-imageio + jai-imageio-jpeg2000 + 1.4.0 + - - org.apache.pdfbox - jbig2-imageio - 3.0.4 - - + + org.apache.pdfbox + jbig2-imageio + 3.0.4 + + From 20b1053a24402a1e3a587ee90211661027d66484 Mon Sep 17 00:00:00 2001 From: young Date: Mon, 29 Apr 2024 17:40:46 +0800 Subject: [PATCH 09/30] fix: oom for removeText --- .../detectors/NurminenDetectionAlgorithm.java | 86 ++++++++++++------- .../technology/tabula/TestTableDetection.java | 53 ++++++------ 2 files changed, 79 insertions(+), 60 deletions(-) diff --git 
a/src/main/java/technology/tabula/detectors/NurminenDetectionAlgorithm.java b/src/main/java/technology/tabula/detectors/NurminenDetectionAlgorithm.java index 9a377854..86639f66 100644 --- a/src/main/java/technology/tabula/detectors/NurminenDetectionAlgorithm.java +++ b/src/main/java/technology/tabula/detectors/NurminenDetectionAlgorithm.java @@ -1,14 +1,8 @@ package technology.tabula.detectors; -import java.awt.geom.Line2D; -import java.awt.geom.Point2D; -import java.awt.image.BufferedImage; -import java.awt.image.Raster; -import java.io.IOException; -import java.io.OutputStream; -import java.util.*; - +import org.apache.pdfbox.contentstream.PDContentStream; import org.apache.pdfbox.contentstream.operator.Operator; +import org.apache.pdfbox.contentstream.operator.OperatorName; import org.apache.pdfbox.cos.COSName; import org.apache.pdfbox.pdfparser.PDFStreamParser; import org.apache.pdfbox.pdfwriter.ContentStreamWriter; @@ -16,16 +10,17 @@ import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.common.PDStream; import org.apache.pdfbox.rendering.ImageType; - -import technology.tabula.Line; -import technology.tabula.Page; -import technology.tabula.Rectangle; -import technology.tabula.Ruling; -import technology.tabula.TextChunk; -import technology.tabula.TextElement; -import technology.tabula.Utils; +import technology.tabula.*; import technology.tabula.extractors.SpreadsheetExtractionAlgorithm; +import java.awt.geom.Line2D; +import java.awt.geom.Point2D; +import java.awt.image.BufferedImage; +import java.awt.image.Raster; +import java.io.IOException; +import java.io.OutputStream; +import java.util.*; + /** * Created by matt on 2015-12-17. *

@@ -799,25 +794,10 @@ private List getVerticalRulings(BufferedImage image) { return verticalRulings; } - - // taken from http://www.docjar.com/html/api/org/apache/pdfbox/examples/util/RemoveAllText.java.html private PDDocument removeText(PDPage page) throws IOException { PDFStreamParser parser = new PDFStreamParser(page); parser.parse(); - List newTokens = new ArrayList<>(); - while (page.hasContents()) { - Object token = parser.parseNextToken(); - if (token instanceof Operator) { - Operator op = (Operator) token; - if ("TJ".equals(op.getName()) || "Tj".equals(op.getName())) { - //remove the one argument to this operator - newTokens.remove(newTokens.size() - 1); - continue; - } - } - newTokens.add(token); - } PDDocument document = new PDDocument(); PDPage newPage = document.importPage(page); @@ -826,9 +806,51 @@ private PDDocument removeText(PDPage page) throws IOException { PDStream newContents = new PDStream(document); OutputStream out = newContents.createOutputStream(COSName.FLATE_DECODE); ContentStreamWriter writer = new ContentStreamWriter(out); - writer.writeTokens(newTokens); + List tokensWithoutText = createTokensWithoutText(page); + writer.writeTokens(tokensWithoutText); out.close(); newPage.setContents(newContents); return document; } + + + /** + * @param contentStream contentStream + * @return newTokens + * @throws IOException When parseNextToken on Error + * @see ... 
+ */ + private static List createTokensWithoutText(PDContentStream contentStream) throws IOException { + PDFStreamParser parser = new PDFStreamParser(contentStream); + Object token = parser.parseNextToken(); + List newTokens = new ArrayList<>(); + while (token != null) { + if (token instanceof Operator) { + Operator op = (Operator) token; + String opName = op.getName(); + if (OperatorName.SHOW_TEXT_ADJUSTED.equals(opName) + || OperatorName.SHOW_TEXT.equals(opName) + || OperatorName.SHOW_TEXT_LINE.equals(opName)) { + // remove the argument to this operator + newTokens.remove(newTokens.size() - 1); + + token = parser.parseNextToken(); + continue; + } else if (OperatorName.SHOW_TEXT_LINE_AND_SPACE.equals(opName)) { + // remove the 3 arguments to this operator + newTokens.remove(newTokens.size() - 1); + newTokens.remove(newTokens.size() - 1); + newTokens.remove(newTokens.size() - 1); + + token = parser.parseNextToken(); + continue; + } + } + newTokens.add(token); + token = parser.parseNextToken(); + } + return newTokens; + } + + } diff --git a/src/test/java/technology/tabula/TestTableDetection.java b/src/test/java/technology/tabula/TestTableDetection.java index 80d21350..c13ff201 100644 --- a/src/test/java/technology/tabula/TestTableDetection.java +++ b/src/test/java/technology/tabula/TestTableDetection.java @@ -1,29 +1,29 @@ package technology.tabula; -import java.io.File; -import java.io.FileWriter; -import java.io.FilenameFilter; -import java.io.IOException; -import java.util.*; -import java.util.logging.Level; -import java.util.logging.Logger; - -import static org.junit.Assert.*; - import com.google.gson.Gson; import org.apache.pdfbox.Loader; +import org.apache.pdfbox.pdmodel.PDDocument; import org.junit.AfterClass; import org.junit.BeforeClass; import org.junit.Test; import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import org.w3c.dom.*; +import org.w3c.dom.Document; +import org.w3c.dom.Element; +import org.w3c.dom.NodeList; +import 
technology.tabula.detectors.NurminenDetectionAlgorithm; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; +import java.util.*; +import java.util.logging.Level; +import java.util.logging.Logger; -import org.apache.pdfbox.pdmodel.PDDocument; -import technology.tabula.detectors.NurminenDetectionAlgorithm; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; /** * Created by matt on 2015-12-14. @@ -111,15 +111,10 @@ public static Collection data() { String directoryName = "src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-" + regionCode + "/"; File dir = new File(directoryName); - File[] pdfs = dir.listFiles(new FilenameFilter() { - @Override - public boolean accept(File dir, String name) { - return name.toLowerCase().endsWith(".pdf"); - } - }); + File[] pdfs = dir.listFiles((dir1, name) -> name.toLowerCase().endsWith(".pdf")); for (File pdf : pdfs) { - data.add(new Object[] {pdf}); + data.add(new Object[]{pdf}); } } @@ -163,6 +158,8 @@ public void testDetectionOfTables() throws Exception { NodeList tables = regionDocument.getElementsByTagName("table"); // tabula extractors + + PDDocument pdfDocument = Loader.loadPDF(this.pdf); ObjectExtractor extractor = new ObjectExtractor(pdfDocument); @@ -171,7 +168,7 @@ public void testDetectionOfTables() throws Exception { int numExpectedTables = 0; - for (int i=0; i tablesOnPage = detectionAlgorithm.detect(page); - if (tablesOnPage.size() > 0) { - detectedTables.put(new Integer(page.getPageNumber()), tablesOnPage); + if (!tablesOnPage.isEmpty()) { + detectedTables.put(page.getPageNumber(), tablesOnPage); } } @@ -267,7 +264,7 @@ public void testDetectionOfTables() throws Exception { System.out.println(totalErroneouslyDetectedTables + " tables incorrectly detected"); - if(this.status.isFirstRun()) { + if (this.status.isFirstRun()) { // make the baseline 
this.status.expectedFailure = failed; this.status.numCorrectlyDetectedTables = this.numCorrectlyDetectedTables; @@ -293,14 +290,14 @@ private List comparePages(Integer page, List detected, List detectedIterator = detected.iterator(); detectedIterator.hasNext();) { + for (Iterator detectedIterator = detected.iterator(); detectedIterator.hasNext(); ) { Rectangle detectedTable = detectedIterator.next(); - for (int i=0; i Date: Mon, 29 Apr 2024 17:51:40 +0800 Subject: [PATCH 10/30] fix: unit test --- .../technology/tabula/TestTextElement.java | 109 +++++++++--------- 1 file changed, 57 insertions(+), 52 deletions(-) diff --git a/src/test/java/technology/tabula/TestTextElement.java b/src/test/java/technology/tabula/TestTextElement.java index 3db1ca31..ee0fbf3d 100644 --- a/src/test/java/technology/tabula/TestTextElement.java +++ b/src/test/java/technology/tabula/TestTextElement.java @@ -1,14 +1,14 @@ package technology.tabula; -import java.util.ArrayList; -import java.util.List; - import org.apache.pdfbox.pdmodel.font.PDFont; import org.apache.pdfbox.pdmodel.font.PDType1Font; import org.apache.pdfbox.pdmodel.font.Standard14Fonts; import org.junit.Assert; import org.junit.Test; +import java.util.ArrayList; +import java.util.List; + public class TestTextElement { @@ -24,7 +24,7 @@ public void createTextElement() { Assert.assertEquals(5f, textElement.getTop(), 0); Assert.assertEquals(10f, textElement.getWidth(), 0); Assert.assertEquals(20f, textElement.getHeight(), 0); - Assert.assertEquals(new PDType1Font(Standard14Fonts.FontName.HELVETICA), textElement.getFont()); + Assert.assertEquals(Standard14Fonts.FontName.HELVETICA.getName(), textElement.getFont().getName()); Assert.assertEquals(1f, textElement.getWidthOfSpace(), 0); Assert.assertEquals(0f, textElement.getDirection(), 0); @@ -43,7 +43,7 @@ public void createTextElementWithDirection() { Assert.assertEquals(5f, textElement.getTop(), 0); Assert.assertEquals(10f, textElement.getWidth(), 0); Assert.assertEquals(20f, 
textElement.getHeight(), 0); - Assert.assertEquals(new PDType1Font(Standard14Fonts.FontName.HELVETICA), textElement.getFont()); + Assert.assertEquals(Standard14Fonts.FontName.HELVETICA.getName(), textElement.getFont().getName()); Assert.assertEquals(1f, textElement.getWidthOfSpace(), 0); Assert.assertEquals(6f, textElement.getDirection(), 0); @@ -54,18 +54,19 @@ public void createTextElementWithDirection() { public void mergeFourElementsIntoFourWords() { List elements = new ArrayList<>(); - elements.add(new TextElement(0f, 15f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "A", 1f, 6f)); - elements.add(new TextElement(20f, 15f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "B", 1f, 6f)); - elements.add(new TextElement(40f, 15f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "C", 1f, 6f)); - elements.add(new TextElement(60f, 15f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "D", 1f, 6f)); + PDType1Font font = new PDType1Font(Standard14Fonts.FontName.HELVETICA); + elements.add(new TextElement(0f, 15f, 10f, 20f, font, 1f, "A", 1f, 6f)); + elements.add(new TextElement(20f, 15f, 10f, 20f, font, 1f, "B", 1f, 6f)); + elements.add(new TextElement(40f, 15f, 10f, 20f, font, 1f, "C", 1f, 6f)); + elements.add(new TextElement(60f, 15f, 10f, 20f, font, 1f, "D", 1f, 6f)); List words = TextElement.mergeWords(elements); List expectedWords = new ArrayList<>(); - expectedWords.add(new TextChunk(new TextElement(0f, 15f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "A", 1f, 6f))); - expectedWords.add(new TextChunk(new TextElement(20f, 15f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "B", 1f, 6f))); - expectedWords.add(new TextChunk(new TextElement(40f, 15f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "C", 1f, 6f))); - expectedWords.add(new TextChunk(new TextElement(60f, 15f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 
1f, "D", 1f, 6f))); + expectedWords.add(new TextChunk(new TextElement(0f, 15f, 10f, 20f, font, 1f, "A", 1f, 6f))); + expectedWords.add(new TextChunk(new TextElement(20f, 15f, 10f, 20f, font, 1f, "B", 1f, 6f))); + expectedWords.add(new TextChunk(new TextElement(40f, 15f, 10f, 20f, font, 1f, "C", 1f, 6f))); + expectedWords.add(new TextChunk(new TextElement(60f, 15f, 10f, 20f, font, 1f, "D", 1f, 6f))); Assert.assertEquals(expectedWords, words); @@ -75,18 +76,19 @@ public void mergeFourElementsIntoFourWords() { public void mergeFourElementsIntoOneWord() { List elements = new ArrayList<>(); - elements.add(new TextElement(0f, 15f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "A", 1f, 6f)); - elements.add(new TextElement(0f, 25f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "B", 1f, 6f)); - elements.add(new TextElement(0f, 35f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "C", 1f, 6f)); - elements.add(new TextElement(0f, 45f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "D", 1f, 6f)); + PDType1Font font = new PDType1Font(Standard14Fonts.FontName.HELVETICA); + elements.add(new TextElement(0f, 15f, 10f, 20f, font, 1f, "A", 1f, 6f)); + elements.add(new TextElement(0f, 25f, 10f, 20f, font, 1f, "B", 1f, 6f)); + elements.add(new TextElement(0f, 35f, 10f, 20f, font, 1f, "C", 1f, 6f)); + elements.add(new TextElement(0f, 45f, 10f, 20f, font, 1f, "D", 1f, 6f)); List words = TextElement.mergeWords(elements); List expectedWords = new ArrayList<>(); - TextChunk textChunk = new TextChunk(new TextElement(0f, 15f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "A", 1f, 6f)); - textChunk.add(new TextElement(0f, 25f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "B", 1f, 6f)); - textChunk.add(new TextElement(0f, 35f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "C", 1f, 6f)); - textChunk.add(new TextElement(0f, 45f, 10f, 20f, new 
PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "D", 1f, 6f)); + TextChunk textChunk = new TextChunk(new TextElement(0f, 15f, 10f, 20f, font, 1f, "A", 1f, 6f)); + textChunk.add(new TextElement(0f, 25f, 10f, 20f, font, 1f, "B", 1f, 6f)); + textChunk.add(new TextElement(0f, 35f, 10f, 20f, font, 1f, "C", 1f, 6f)); + textChunk.add(new TextElement(0f, 45f, 10f, 20f, font, 1f, "D", 1f, 6f)); expectedWords.add(textChunk); Assert.assertEquals(expectedWords, words); @@ -101,10 +103,11 @@ public void mergeElementsShouldBeIdempotent() { */ List elements = new ArrayList<>(); - elements.add(new TextElement(0f, 15f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "A", 1f, 6f)); - elements.add(new TextElement(0f, 25f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "B", 1f, 6f)); - elements.add(new TextElement(0f, 35f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "C", 1f, 6f)); - elements.add(new TextElement(0f, 45f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "D", 1f, 6f)); + PDType1Font font = new PDType1Font(Standard14Fonts.FontName.HELVETICA); + elements.add(new TextElement(0f, 15f, 10f, 20f, font, 1f, "A", 1f, 6f)); + elements.add(new TextElement(0f, 25f, 10f, 20f, font, 1f, "B", 1f, 6f)); + elements.add(new TextElement(0f, 35f, 10f, 20f, font, 1f, "C", 1f, 6f)); + elements.add(new TextElement(0f, 45f, 10f, 20f, font, 1f, "D", 1f, 6f)); List words = TextElement.mergeWords(elements); List words2 = TextElement.mergeWords(elements); @@ -115,20 +118,21 @@ public void mergeElementsShouldBeIdempotent() { public void mergeElementsWithSkippingRules() { List elements = new ArrayList<>(); - elements.add(new TextElement(0f, 15f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "A", 1f, 6f)); - elements.add(new TextElement(0f, 17f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "A", 1f, 6f)); - elements.add(new TextElement(0f, 25f, 10f, 20f, new 
PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "B", 1f, 6f)); - elements.add(new TextElement(0.001f, 25f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, " ", 1f, 6f)); - elements.add(new TextElement(0f, 35f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "C", 1f, 6f)); + PDType1Font font = new PDType1Font(Standard14Fonts.FontName.HELVETICA); + elements.add(new TextElement(0f, 15f, 10f, 20f, font, 1f, "A", 1f, 6f)); + elements.add(new TextElement(0f, 17f, 10f, 20f, font, 1f, "A", 1f, 6f)); + elements.add(new TextElement(0f, 25f, 10f, 20f, font, 1f, "B", 1f, 6f)); + elements.add(new TextElement(0.001f, 25f, 10f, 20f, font, 1f, " ", 1f, 6f)); + elements.add(new TextElement(0f, 35f, 10f, 20f, font, 1f, "C", 1f, 6f)); PDFont TIMES_ROMAN = new PDType1Font(Standard14Fonts.FontName.TIMES_ROMAN); elements.add(new TextElement(0f, 45f, 10f, 20f, TIMES_ROMAN, 10f, "D", 1f, 6f)); List words = TextElement.mergeWords(elements); List expectedWords = new ArrayList<>(); - TextChunk textChunk = new TextChunk(new TextElement(0f, 15f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "A", 1f, 6f)); - textChunk.add(new TextElement(0f, 25f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "B", 1f, 6f)); - textChunk.add(new TextElement(0f, 35f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "C", 1f, 6f)); + TextChunk textChunk = new TextChunk(new TextElement(0f, 15f, 10f, 20f, font, 1f, "A", 1f, 6f)); + textChunk.add(new TextElement(0f, 25f, 10f, 20f, font, 1f, "B", 1f, 6f)); + textChunk.add(new TextElement(0f, 35f, 10f, 20f, font, 1f, "C", 1f, 6f)); textChunk.add(new TextElement(0f, 45f, 10f, 20f, TIMES_ROMAN, 10f, "D", 1f, 6f)); expectedWords.add(textChunk); @@ -140,30 +144,31 @@ public void mergeElementsWithSkippingRules() { public void mergeTenElementsIntoTwoWords() { List elements = new ArrayList<>(); - elements.add(new TextElement(0f, 0f, 10f, 20f, new 
PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "H", 1f, 6f)); - elements.add(new TextElement(0f, 10f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "O", 1f, 6f)); - elements.add(new TextElement(0f, 20f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "L", 1f, 6f)); - elements.add(new TextElement(0f, 30f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "A", 1f, 6f)); - elements.add(new TextElement(0f, 60f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "M", 1f, 6f)); - elements.add(new TextElement(0f, 70f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "U", 1f, 6f)); - elements.add(new TextElement(0f, 80f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "N", 1f, 6f)); - elements.add(new TextElement(0f, 90f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "D", 1f, 6f)); - elements.add(new TextElement(0f, 100f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "O", 1f, 6f)); + PDType1Font font = new PDType1Font(Standard14Fonts.FontName.HELVETICA); + elements.add(new TextElement(0f, 0f, 10f, 20f, font, 1f, "H", 1f, 6f)); + elements.add(new TextElement(0f, 10f, 10f, 20f, font, 1f, "O", 1f, 6f)); + elements.add(new TextElement(0f, 20f, 10f, 20f, font, 1f, "L", 1f, 6f)); + elements.add(new TextElement(0f, 30f, 10f, 20f, font, 1f, "A", 1f, 6f)); + elements.add(new TextElement(0f, 60f, 10f, 20f, font, 1f, "M", 1f, 6f)); + elements.add(new TextElement(0f, 70f, 10f, 20f, font, 1f, "U", 1f, 6f)); + elements.add(new TextElement(0f, 80f, 10f, 20f, font, 1f, "N", 1f, 6f)); + elements.add(new TextElement(0f, 90f, 10f, 20f, font, 1f, "D", 1f, 6f)); + elements.add(new TextElement(0f, 100f, 10f, 20f, font, 1f, "O", 1f, 6f)); List words = TextElement.mergeWords(elements); List expectedWords = new ArrayList<>(); - TextChunk textChunk = new TextChunk(new TextElement(0f, 0f, 10f, 20f, new 
PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "H", 1f, 6f)); - textChunk.add(new TextElement(0f, 10f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "O", 1f, 6f)); - textChunk.add(new TextElement(0f, 20f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "L", 1f, 6f)); - textChunk.add(new TextElement(0f, 30f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "A", 1f, 6f)); - textChunk.add(new TextElement(0f, 30f, 10.5f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, " ", 1f)); //Check why width=10.5? + TextChunk textChunk = new TextChunk(new TextElement(0f, 0f, 10f, 20f, font, 1f, "H", 1f, 6f)); + textChunk.add(new TextElement(0f, 10f, 10f, 20f, font, 1f, "O", 1f, 6f)); + textChunk.add(new TextElement(0f, 20f, 10f, 20f, font, 1f, "L", 1f, 6f)); + textChunk.add(new TextElement(0f, 30f, 10f, 20f, font, 1f, "A", 1f, 6f)); + textChunk.add(new TextElement(0f, 30f, 10.5f, 20f, font, 1f, " ", 1f)); //Check why width=10.5? 
expectedWords.add(textChunk); - TextChunk textChunk2 = new TextChunk(new TextElement(0f, 60f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "M", 1f, 6f)); - textChunk2.add(new TextElement(0f, 70f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "U", 1f, 6f)); - textChunk2.add(new TextElement(0f, 80f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "N", 1f, 6f)); - textChunk2.add(new TextElement(0f, 90f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "D", 1f, 6f)); - textChunk2.add(new TextElement(0f, 100f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "O", 1f, 6f)); + TextChunk textChunk2 = new TextChunk(new TextElement(0f, 60f, 10f, 20f, font, 1f, "M", 1f, 6f)); + textChunk2.add(new TextElement(0f, 70f, 10f, 20f, font, 1f, "U", 1f, 6f)); + textChunk2.add(new TextElement(0f, 80f, 10f, 20f, font, 1f, "N", 1f, 6f)); + textChunk2.add(new TextElement(0f, 90f, 10f, 20f, font, 1f, "D", 1f, 6f)); + textChunk2.add(new TextElement(0f, 100f, 10f, 20f, font, 1f, "O", 1f, 6f)); expectedWords.add(textChunk2); Assert.assertEquals(2, words.size()); From 6d59cddd5e4523d74aa03739be5992d35372fdd3 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 27 Feb 2023 21:12:28 +0000 Subject: [PATCH 11/30] Bump maven-compiler-plugin from 3.8.1 to 3.11.0 Bumps [maven-compiler-plugin](https://github.com/apache/maven-compiler-plugin) from 3.8.1 to 3.11.0. - [Release notes](https://github.com/apache/maven-compiler-plugin/releases) - [Commits](https://github.com/apache/maven-compiler-plugin/compare/maven-compiler-plugin-3.8.1...maven-compiler-plugin-3.11.0) --- updated-dependencies: - dependency-name: org.apache.maven.plugins:maven-compiler-plugin dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 52943fbf..b3344e12 100644 --- a/pom.xml +++ b/pom.xml @@ -147,7 +147,7 @@ maven-compiler-plugin - 3.8.1 + 3.11.0 1.8 1.8 From 2bdeb954675cb2ad05431210d3f06db74a490fe9 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 16 Jul 2024 21:29:09 +0000 Subject: [PATCH 12/30] Bump org.apache.maven.plugins:maven-gpg-plugin from 1.6 to 3.2.4 Bumps [org.apache.maven.plugins:maven-gpg-plugin](https://github.com/apache/maven-gpg-plugin) from 1.6 to 3.2.4. - [Release notes](https://github.com/apache/maven-gpg-plugin/releases) - [Commits](https://github.com/apache/maven-gpg-plugin/compare/maven-gpg-plugin-1.6...maven-gpg-plugin-3.2.4) --- updated-dependencies: - dependency-name: org.apache.maven.plugins:maven-gpg-plugin dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] --- pom.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pom.xml b/pom.xml index b3344e12..7f30e7a4 100644 --- a/pom.xml +++ b/pom.xml @@ -128,7 +128,7 @@ org.apache.maven.plugins maven-gpg-plugin - 1.6 + 3.2.4 sign-artifacts @@ -225,7 +225,7 @@ org.apache.maven.plugins maven-gpg-plugin - 1.6 + 3.2.4 sign-artifacts From c831cf6ac36c5315b96ff6a49212bb67908ce48e Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 16 Jul 2024 21:28:58 +0000 Subject: [PATCH 13/30] Bump commons-cli:commons-cli from 1.4 to 1.8.0 Bumps commons-cli:commons-cli from 1.4 to 1.8.0. --- updated-dependencies: - dependency-name: commons-cli:commons-cli dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 7f30e7a4..eb362e89 100644 --- a/pom.xml +++ b/pom.xml @@ -288,7 +288,7 @@ commons-cli commons-cli - 1.4 + 1.8.0 From 9dc64f867a01e69e6e929feaa5a909c02b9bd3e9 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 16 Jul 2024 21:28:56 +0000 Subject: [PATCH 14/30] Bump org.slf4j:slf4j-api from 1.7.35 to 2.0.13 Bumps org.slf4j:slf4j-api from 1.7.35 to 2.0.13. --- updated-dependencies: - dependency-name: org.slf4j:slf4j-api dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index eb362e89..adf29ce5 100644 --- a/pom.xml +++ b/pom.xml @@ -251,7 +251,7 @@ org.slf4j slf4j-api - 1.7.35 + 2.0.13 From 3f7445380ec4f48dfc545dd6d33e89d4c501af55 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 16 Jul 2024 21:28:55 +0000 Subject: [PATCH 15/30] Bump org.slf4j:slf4j-simple from 1.7.32 to 2.0.13 Bumps org.slf4j:slf4j-simple from 1.7.32 to 2.0.13. --- updated-dependencies: - dependency-name: org.slf4j:slf4j-simple dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index adf29ce5..8e0736c4 100644 --- a/pom.xml +++ b/pom.xml @@ -257,7 +257,7 @@ org.slf4j slf4j-simple - 1.7.32 + 2.0.13 From 2ef079f2a14dc6d66c68c5ce8d03853eea7436f4 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 21 Jun 2022 21:32:54 +0000 Subject: [PATCH 16/30] Bump jts-core from 1.18.1 to 1.19.0 Bumps jts-core from 1.18.1 to 1.19.0. 
--- updated-dependencies: - dependency-name: org.locationtech.jts:jts-core dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 8e0736c4..19bf0133 100644 --- a/pom.xml +++ b/pom.xml @@ -245,7 +245,7 @@ org.locationtech.jts jts-core - 1.18.1 + 1.19.0 From c1e4e326eddc1a2dfe59febf24a569d11bde5cfb Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 15 Feb 2023 21:59:29 +0000 Subject: [PATCH 17/30] Bump maven-javadoc-plugin from 3.3.1 to 3.5.0 Bumps [maven-javadoc-plugin](https://github.com/apache/maven-javadoc-plugin) from 3.3.1 to 3.5.0. - [Release notes](https://github.com/apache/maven-javadoc-plugin/releases) - [Commits](https://github.com/apache/maven-javadoc-plugin/compare/maven-javadoc-plugin-3.3.1...maven-javadoc-plugin-3.5.0) --- updated-dependencies: - dependency-name: org.apache.maven.plugins:maven-javadoc-plugin dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] --- pom.xml | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/pom.xml b/pom.xml index 19bf0133..749fa0b7 100644 --- a/pom.xml +++ b/pom.xml @@ -74,7 +74,7 @@ org.apache.maven.plugins maven-javadoc-plugin - 3.3.1 + 3.5.0 true @@ -110,20 +110,20 @@ - org.apache.maven.plugins - maven-javadoc-plugin - 3.3.1 - - 8 - - - - attach-javadocs - - jar - - - + org.apache.maven.plugins + maven-javadoc-plugin + 3.5.0 + + 8 + + + + attach-javadocs + + jar + + + org.apache.maven.plugins From 5761334b86f58723e761b4941f2950d7b6e53d82 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 16 Jul 2024 21:29:03 +0000 Subject: [PATCH 18/30] Bump org.sonatype.plugins:nexus-staging-maven-plugin from 1.6.8 to 1.7.0 Bumps org.sonatype.plugins:nexus-staging-maven-plugin from 1.6.8 to 1.7.0. --- updated-dependencies: - dependency-name: org.sonatype.plugins:nexus-staging-maven-plugin dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 749fa0b7..6a66162f 100644 --- a/pom.xml +++ b/pom.xml @@ -87,7 +87,7 @@ org.sonatype.plugins nexus-staging-maven-plugin - 1.6.8 + 1.7.0 true ossrh From ab7c4bd54bd20ca03c2bfad400c5cc6c26e34d59 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 16 Jul 2024 21:28:53 +0000 Subject: [PATCH 19/30] Bump org.apache.maven.plugins:maven-source-plugin from 3.2.1 to 3.3.1 Bumps [org.apache.maven.plugins:maven-source-plugin](https://github.com/apache/maven-source-plugin) from 3.2.1 to 3.3.1. 
- [Release notes](https://github.com/apache/maven-source-plugin/releases) - [Commits](https://github.com/apache/maven-source-plugin/compare/maven-source-plugin-3.2.1...maven-source-plugin-3.3.1) --- updated-dependencies: - dependency-name: org.apache.maven.plugins:maven-source-plugin dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- pom.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pom.xml b/pom.xml index 6a66162f..77cda400 100644 --- a/pom.xml +++ b/pom.xml @@ -99,7 +99,7 @@ org.apache.maven.plugins maven-source-plugin - 3.2.1 + 3.3.1 attach-sources @@ -212,7 +212,7 @@ org.apache.maven.plugins maven-source-plugin - 3.2.1 + 3.3.1 attach-sources From ebe8e30dedfd6f7553046bbe6bbd3640b121d3dd Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 17 Jul 2024 21:36:59 +0000 Subject: [PATCH 20/30] Bump org.apache.commons:commons-csv from 1.9.0 to 1.11.0 Bumps [org.apache.commons:commons-csv](https://github.com/apache/commons-csv) from 1.9.0 to 1.11.0. - [Changelog](https://github.com/apache/commons-csv/blob/master/RELEASE-NOTES.txt) - [Commits](https://github.com/apache/commons-csv/compare/rel/commons-csv-1.9.0...rel/commons-csv-1.11.0) --- updated-dependencies: - dependency-name: org.apache.commons:commons-csv dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 77cda400..a45e6089 100644 --- a/pom.xml +++ b/pom.xml @@ -294,7 +294,7 @@ org.apache.commons commons-csv - 1.9.0 + 1.11.0 From db3f6dfd74801c824efd2a25dc26b4a3cb8d7922 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 17 Jul 2024 21:36:45 +0000 Subject: [PATCH 21/30] Bump org.apache.maven.plugins:maven-compiler-plugin Bumps [org.apache.maven.plugins:maven-compiler-plugin](https://github.com/apache/maven-compiler-plugin) from 3.11.0 to 3.13.0. - [Release notes](https://github.com/apache/maven-compiler-plugin/releases) - [Commits](https://github.com/apache/maven-compiler-plugin/compare/maven-compiler-plugin-3.11.0...maven-compiler-plugin-3.13.0) --- updated-dependencies: - dependency-name: org.apache.maven.plugins:maven-compiler-plugin dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index a45e6089..b73d8b1e 100644 --- a/pom.xml +++ b/pom.xml @@ -147,7 +147,7 @@ maven-compiler-plugin - 3.11.0 + 3.13.0 1.8 1.8 From fd3a32c579f672ba17c5f1231985e980c4e3ec4e Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 17 Jul 2024 21:36:48 +0000 Subject: [PATCH 22/30] Bump com.google.code.gson:gson from 2.9.0 to 2.11.0 Bumps [com.google.code.gson:gson](https://github.com/google/gson) from 2.9.0 to 2.11.0. - [Release notes](https://github.com/google/gson/releases) - [Changelog](https://github.com/google/gson/blob/main/CHANGELOG.md) - [Commits](https://github.com/google/gson/compare/gson-parent-2.9.0...gson-parent-2.11.0) --- updated-dependencies: - dependency-name: com.google.code.gson:gson dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index b73d8b1e..394ea68f 100644 --- a/pom.xml +++ b/pom.xml @@ -300,7 +300,7 @@ com.google.code.gson gson - 2.9.0 + 2.11.0 From 097559d0a185ca1dda25d7b7ff103e884848c70c Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 17 Jul 2024 21:36:52 +0000 Subject: [PATCH 23/30] Bump org.apache.maven.plugins:maven-javadoc-plugin from 3.3.1 to 3.7.0 Bumps [org.apache.maven.plugins:maven-javadoc-plugin](https://github.com/apache/maven-javadoc-plugin) from 3.3.1 to 3.7.0. - [Release notes](https://github.com/apache/maven-javadoc-plugin/releases) - [Commits](https://github.com/apache/maven-javadoc-plugin/compare/maven-javadoc-plugin-3.3.1...maven-javadoc-plugin-3.7.0) --- updated-dependencies: - dependency-name: org.apache.maven.plugins:maven-javadoc-plugin dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- pom.xml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pom.xml b/pom.xml index 394ea68f..a4871012 100644 --- a/pom.xml +++ b/pom.xml @@ -74,7 +74,7 @@ org.apache.maven.plugins maven-javadoc-plugin - 3.5.0 + 3.7.0 true @@ -112,7 +112,7 @@ org.apache.maven.plugins maven-javadoc-plugin - 3.5.0 + 3.7.0 8 @@ -196,7 +196,7 @@ org.apache.maven.plugins maven-javadoc-plugin - 3.3.1 + 3.7.0 8 From bde6d765cfab25d53ff885de33a4556fc41bb9d7 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 17 Jul 2024 21:36:55 +0000 Subject: [PATCH 24/30] Bump org.apache.maven.plugins:maven-surefire-plugin from 2.22.2 to 3.3.1 Bumps [org.apache.maven.plugins:maven-surefire-plugin](https://github.com/apache/maven-surefire) from 2.22.2 to 3.3.1. 
- [Release notes](https://github.com/apache/maven-surefire/releases) - [Commits](https://github.com/apache/maven-surefire/compare/surefire-2.22.2...surefire-3.3.1) --- updated-dependencies: - dependency-name: org.apache.maven.plugins:maven-surefire-plugin dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index a4871012..8fd27509 100644 --- a/pom.xml +++ b/pom.xml @@ -169,7 +169,7 @@ org.apache.maven.plugins maven-surefire-plugin - 2.22.2 + 3.3.1 -Xms1024m -Xmx2048m From 0c73e698b979a74cac0e917718b2c5dfd098dacc Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 22 Jul 2024 21:03:37 +0000 Subject: [PATCH 25/30] Bump org.apache.maven.plugins:maven-javadoc-plugin from 3.7.0 to 3.8.0 Bumps [org.apache.maven.plugins:maven-javadoc-plugin](https://github.com/apache/maven-javadoc-plugin) from 3.7.0 to 3.8.0. - [Release notes](https://github.com/apache/maven-javadoc-plugin/releases) - [Commits](https://github.com/apache/maven-javadoc-plugin/compare/maven-javadoc-plugin-3.7.0...maven-javadoc-plugin-3.8.0) --- updated-dependencies: - dependency-name: org.apache.maven.plugins:maven-javadoc-plugin dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] --- pom.xml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pom.xml b/pom.xml index 8fd27509..d0b40101 100644 --- a/pom.xml +++ b/pom.xml @@ -74,7 +74,7 @@ org.apache.maven.plugins maven-javadoc-plugin - 3.7.0 + 3.8.0 true @@ -112,7 +112,7 @@ org.apache.maven.plugins maven-javadoc-plugin - 3.7.0 + 3.8.0 8 @@ -196,7 +196,7 @@ org.apache.maven.plugins maven-javadoc-plugin - 3.7.0 + 3.8.0 8 From 818c9a2f5a5ea8dc72d3efa775f192381e84b8c1 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 9 Aug 2024 21:53:01 +0000 Subject: [PATCH 26/30] Bump org.apache.pdfbox:pdfbox from 3.0.2 to 3.0.3 Bumps org.apache.pdfbox:pdfbox from 3.0.2 to 3.0.3. --- updated-dependencies: - dependency-name: org.apache.pdfbox:pdfbox dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index d0b40101..a963e35a 100644 --- a/pom.xml +++ b/pom.xml @@ -263,7 +263,7 @@ org.apache.pdfbox pdfbox - 3.0.2 + 3.0.3 From 5d91f1d733c4895d31854a641c152220f8c5f341 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 30 Aug 2024 21:39:59 +0000 Subject: [PATCH 27/30] Bump org.locationtech.jts:jts-core from 1.19.0 to 1.20.0 Bumps org.locationtech.jts:jts-core from 1.19.0 to 1.20.0. --- updated-dependencies: - dependency-name: org.locationtech.jts:jts-core dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index a963e35a..49057e90 100644 --- a/pom.xml +++ b/pom.xml @@ -245,7 +245,7 @@ org.locationtech.jts jts-core - 1.19.0 + 1.20.0 From 971ae765e84f09ed83f5808b66f764590146e923 Mon Sep 17 00:00:00 2001 From: Kyle Lacy Date: Thu, 20 Feb 2025 15:29:09 -0800 Subject: [PATCH 28/30] Upgrade BouncyCastle dependencies --- pom.xml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pom.xml b/pom.xml index 49057e90..8b7b3b2d 100644 --- a/pom.xml +++ b/pom.xml @@ -268,14 +268,14 @@ org.bouncycastle - bcprov-jdk15on - 1.70 + bcprov-jdk18on + 1.80 org.bouncycastle - bcmail-jdk15on - 1.70 + bcmail-jdk18on + 1.80 From 88154e2c15967cc4c2a2606a8da25d47b9b916c3 Mon Sep 17 00:00:00 2001 From: Tilman Hausherr Date: Wed, 19 Mar 2025 15:36:11 +0100 Subject: [PATCH 29/30] Update PDFBox --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 8b7b3b2d..211d0d4d 100644 --- a/pom.xml +++ b/pom.xml @@ -263,7 +263,7 @@ org.apache.pdfbox pdfbox - 3.0.3 + 3.0.4 From 2cdf3b4fd3f7e921dca8cc6814cdd9316be40f0f Mon Sep 17 00:00:00 2001 From: Tilman Hausherr Date: Wed, 19 Mar 2025 15:38:16 +0100 Subject: [PATCH 30/30] Adjust test Test needs to be adjusted because PDFBox now supports the /ActualText feature of PDF. 
--- src/test/java/technology/tabula/TestBasicExtractor.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/test/java/technology/tabula/TestBasicExtractor.java b/src/test/java/technology/tabula/TestBasicExtractor.java index d120546f..b56fd6ea 100644 --- a/src/test/java/technology/tabula/TestBasicExtractor.java +++ b/src/test/java/technology/tabula/TestBasicExtractor.java @@ -203,7 +203,7 @@ public void testCheckSqueezeDoesntBreak() throws IOException { List> rows = table.getRows(); List firstRow = rows.get(0); List lastRow = rows.get(rows.size() - 1); - assertTrue(firstRow.get(0).getText().equals("Violent crime . . . . . . . . . . . . . . . . . .")); + assertTrue(firstRow.get(0).getText().equals("Violent crime. . . . . . . . . . . . . . . . . .")); assertTrue(lastRow.get(lastRow.size() - 1).getText().equals("(X)")); page.getPDDoc().close(); }