From bc60be27d663f40e132a21407ec81790a8aee361 Mon Sep 17 00:00:00 2001 From: young Date: Wed, 6 Mar 2024 14:53:55 +0800 Subject: [PATCH 01/26] update pdfbox to 3.0.1 --- pom.xml | 2 +- .../technology/tabula/CommandLineApp.java | 3 +- .../java/technology/tabula/debug/Debug.java | 5 +- .../detectors/NurminenDetectionAlgorithm.java | 6 +- src/test/java/technology/tabula/TestCell.java | 3 +- src/test/java/technology/tabula/TestLine.java | 11 +- .../tabula/TestObjectExtractor.java | 21 +- .../tabula/TestProjectionProfile.java | 8 +- .../technology/tabula/TestTableDetection.java | 3 +- .../technology/tabula/TestTextElement.java | 393 +++++++++--------- .../java/technology/tabula/TestUtils.java | 3 +- .../technology/tabula/UtilsForTesting.java | 7 +- 12 files changed, 238 insertions(+), 227 deletions(-) diff --git a/pom.xml b/pom.xml index 27a03e73..6c71b426 100644 --- a/pom.xml +++ b/pom.xml @@ -262,7 +262,7 @@ org.apache.pdfbox pdfbox - 2.0.28 + 3.0.1 diff --git a/src/main/java/technology/tabula/CommandLineApp.java b/src/main/java/technology/tabula/CommandLineApp.java index 3a6773a9..1b422303 100644 --- a/src/main/java/technology/tabula/CommandLineApp.java +++ b/src/main/java/technology/tabula/CommandLineApp.java @@ -15,6 +15,7 @@ import org.apache.commons.cli.Options; import org.apache.commons.cli.ParseException; import org.apache.commons.cli.DefaultParser; +import org.apache.pdfbox.Loader; import org.apache.pdfbox.pdmodel.PDDocument; import technology.tabula.detectors.DetectionAlgorithm; @@ -158,7 +159,7 @@ public void extractFileInto(File pdfFile, File outputFile) throws ParseException private void extractFile(File pdfFile, Appendable outFile) throws ParseException { PDDocument pdfDocument = null; try { - pdfDocument = this.password == null ? PDDocument.load(pdfFile) : PDDocument.load(pdfFile, this.password); + pdfDocument = this.password == null ? 
Loader.loadPDF(pdfFile) : Loader.loadPDF(pdfFile,password); PageIterator pageIterator = getPageIterator(pdfDocument); List tables = new ArrayList<>(); diff --git a/src/main/java/technology/tabula/debug/Debug.java b/src/main/java/technology/tabula/debug/Debug.java index 91609045..d6d257ce 100644 --- a/src/main/java/technology/tabula/debug/Debug.java +++ b/src/main/java/technology/tabula/debug/Debug.java @@ -16,6 +16,7 @@ import java.util.List; import org.apache.commons.cli.*; +import org.apache.pdfbox.Loader; import technology.tabula.Cell; import technology.tabula.CommandLineApp; import technology.tabula.Line; @@ -215,7 +216,7 @@ public static void renderPage(String pdfPath, String outPath, int pageNumber, Re boolean drawColumns, boolean drawCharacters, boolean drawArea, boolean drawCells, boolean drawUnprocessedRulings, boolean drawProjectionProfile, boolean drawClippingPaths, boolean drawDetectedTables) throws IOException { - PDDocument document = PDDocument.load(new File(pdfPath)); + PDDocument document = Loader.loadPDF(new File(pdfPath)); ObjectExtractor oe = new ObjectExtractor(document); @@ -349,7 +350,7 @@ public static void main(String[] args) throws IOException { if (pages == null) { // user specified all pages - PDDocument document = PDDocument.load(pdfFile); + PDDocument document = Loader.loadPDF(pdfFile); int numPages = document.getNumberOfPages(); pages = new ArrayList<>(numPages); diff --git a/src/main/java/technology/tabula/detectors/NurminenDetectionAlgorithm.java b/src/main/java/technology/tabula/detectors/NurminenDetectionAlgorithm.java index fb43622a..9a377854 100644 --- a/src/main/java/technology/tabula/detectors/NurminenDetectionAlgorithm.java +++ b/src/main/java/technology/tabula/detectors/NurminenDetectionAlgorithm.java @@ -805,12 +805,12 @@ private PDDocument removeText(PDPage page) throws IOException { PDFStreamParser parser = new PDFStreamParser(page); parser.parse(); - List tokens = parser.getTokens(); List newTokens = new ArrayList<>(); - 
for (Object token : tokens) { + while (page.hasContents()) { + Object token = parser.parseNextToken(); if (token instanceof Operator) { Operator op = (Operator) token; - if (op.getName().equals("TJ") || op.getName().equals("Tj")) { + if ("TJ".equals(op.getName()) || "Tj".equals(op.getName())) { //remove the one argument to this operator newTokens.remove(newTokens.size() - 1); continue; diff --git a/src/test/java/technology/tabula/TestCell.java b/src/test/java/technology/tabula/TestCell.java index de1b8cb8..2795565c 100644 --- a/src/test/java/technology/tabula/TestCell.java +++ b/src/test/java/technology/tabula/TestCell.java @@ -6,6 +6,7 @@ import java.util.ArrayList; import org.apache.pdfbox.pdmodel.font.PDType1Font; +import org.apache.pdfbox.pdmodel.font.Standard14Fonts; import org.junit.Test; public class TestCell { @@ -31,7 +32,7 @@ public void testGetTextElements() { Cell cell = new Cell(0, 0, 0, 0); assertTrue(cell.getTextElements().isEmpty()); - TextElement tElement = new TextElement(0, 0, 0, 0, PDType1Font.HELVETICA_BOLD, 10, "test", 5); + TextElement tElement = new TextElement(0, 0, 0, 0, new PDType1Font(Standard14Fonts.FontName.HELVETICA_BOLD), 10, "test", 5); TextChunk tChunk = new TextChunk(tElement); List tList = new ArrayList<>(); tList.add(tChunk); diff --git a/src/test/java/technology/tabula/TestLine.java b/src/test/java/technology/tabula/TestLine.java index 90df0e31..f7a6a88d 100644 --- a/src/test/java/technology/tabula/TestLine.java +++ b/src/test/java/technology/tabula/TestLine.java @@ -6,6 +6,7 @@ import java.util.List; import org.apache.pdfbox.pdmodel.font.PDType1Font; +import org.apache.pdfbox.pdmodel.font.Standard14Fonts; import org.junit.Test; public class TestLine { @@ -14,7 +15,7 @@ public class TestLine { public void testSetTextElements() { Line line = new Line(); - TextElement tElement = new TextElement(0, 0, 0, 0, PDType1Font.HELVETICA_BOLD, 10, "test", 5); + TextElement tElement = new TextElement(0, 0, 0, 0, new 
PDType1Font(Standard14Fonts.FontName.HELVETICA_BOLD), 10, "test", 5); TextChunk tChunk = new TextChunk(tElement); List tList = new ArrayList<>(); tList.add(tChunk); @@ -28,7 +29,7 @@ public void testSetTextElements() { public void testAddTextChunkIntTextChunk() { Line line = new Line(); - TextElement tElement = new TextElement(0, 0, 0, 0, PDType1Font.HELVETICA_BOLD, 10, "test", 5); + TextElement tElement = new TextElement(0, 0, 0, 0, new PDType1Font(Standard14Fonts.FontName.HELVETICA_BOLD), 10, "test", 5); TextChunk tChunk = new TextChunk(tElement); line.addTextChunk(3, tChunk); @@ -39,7 +40,7 @@ public void testAddTextChunkIntTextChunk() { public void testLessThanAddTextChunkIntTextChunk() { Line line = new Line(); - TextElement tElement = new TextElement(0, 0, 0, 0, PDType1Font.HELVETICA_BOLD, 10, "test", 5); + TextElement tElement = new TextElement(0, 0, 0, 0, new PDType1Font(Standard14Fonts.FontName.HELVETICA_BOLD), 10, "test", 5); TextChunk tChunk = new TextChunk(tElement); line.addTextChunk(0, tChunk); line.addTextChunk(0, tChunk); @@ -51,7 +52,7 @@ public void testLessThanAddTextChunkIntTextChunk() { public void testErrorAddTextChunkIntTextChunk() { Line line = new Line(); - TextElement tElement = new TextElement(0, 0, 0, 0, PDType1Font.HELVETICA_BOLD, 10, "test", 5); + TextElement tElement = new TextElement(0, 0, 0, 0,new PDType1Font(Standard14Fonts.FontName.HELVETICA_BOLD), 10, "test", 5); TextChunk tChunk = new TextChunk(tElement); line.addTextChunk(-1, tChunk); } @@ -60,7 +61,7 @@ public void testErrorAddTextChunkIntTextChunk() { public void testToString() { Line line = new Line(); - TextElement tElement = new TextElement(0, 0, 0, 0, PDType1Font.HELVETICA_BOLD, 10, "test", 5); + TextElement tElement = new TextElement(0, 0, 0, 0, new PDType1Font(Standard14Fonts.FontName.HELVETICA_BOLD), 10, "test", 5); TextChunk tChunk = new TextChunk(tElement); line.addTextChunk(0, tChunk); line.addTextChunk(0, tChunk); diff --git 
a/src/test/java/technology/tabula/TestObjectExtractor.java b/src/test/java/technology/tabula/TestObjectExtractor.java index 9db7ad18..69864c61 100644 --- a/src/test/java/technology/tabula/TestObjectExtractor.java +++ b/src/test/java/technology/tabula/TestObjectExtractor.java @@ -7,6 +7,7 @@ import java.util.ArrayList; import java.util.List; +import org.apache.pdfbox.Loader; import org.apache.pdfbox.pdmodel.PDDocument; import org.junit.Test; @@ -21,7 +22,7 @@ public void testWrongPasswordRaisesException() throws IOException { @Test(expected = IOException.class) public void testEmptyOnEncryptedFileRaisesException() throws IOException { - PDDocument pdf_document = PDDocument.load(new File("src/test/resources/technology/tabula/encrypted.pdf")); + PDDocument pdf_document = Loader.loadPDF(new File("src/test/resources/technology/tabula/encrypted.pdf")); try (ObjectExtractor oe = new ObjectExtractor(pdf_document)) { oe.extract().next(); } @@ -29,7 +30,7 @@ public void testEmptyOnEncryptedFileRaisesException() throws IOException { @Test public void testCanReadPDFWithOwnerEncryption() throws IOException { - PDDocument pdf_document = PDDocument.load(new File("src/test/resources/technology/tabula/S2MNCEbirdisland.pdf")); + PDDocument pdf_document = Loader.loadPDF(new File("src/test/resources/technology/tabula/S2MNCEbirdisland.pdf")); try (ObjectExtractor oe = new ObjectExtractor(pdf_document)) { PageIterator pi = oe.extract(); int i = 0; @@ -44,7 +45,7 @@ public void testCanReadPDFWithOwnerEncryption() throws IOException { @Test public void testGoodPassword() throws IOException { - PDDocument pdf_document = PDDocument.load(new File("src/test/resources/technology/tabula/encrypted.pdf"), "userpassword"); + PDDocument pdf_document = Loader.loadPDF(new File("src/test/resources/technology/tabula/encrypted.pdf"), "userpassword"); try (ObjectExtractor oe = new ObjectExtractor(pdf_document)) { List pages = new ArrayList<>(); PageIterator pi = oe.extract(); @@ -58,7 +59,7 @@ public 
void testGoodPassword() throws IOException { @Test public void testTextExtractionDoesNotRaise() throws IOException { - PDDocument pdf_document = PDDocument.load(new File("src/test/resources/technology/tabula/rotated_page.pdf")); + PDDocument pdf_document = Loader.loadPDF(new File("src/test/resources/technology/tabula/rotated_page.pdf")); try (ObjectExtractor oe = new ObjectExtractor(pdf_document)) { PageIterator pi = oe.extract(); @@ -70,7 +71,7 @@ public void testTextExtractionDoesNotRaise() throws IOException { @Test public void testShouldDetectRulings() throws IOException { - PDDocument pdf_document = PDDocument.load(new File("src/test/resources/technology/tabula/should_detect_rulings.pdf")); + PDDocument pdf_document = Loader.loadPDF(new File("src/test/resources/technology/tabula/should_detect_rulings.pdf")); try (ObjectExtractor oe = new ObjectExtractor(pdf_document)) { PageIterator pi = oe.extract(); @@ -85,7 +86,7 @@ public void testShouldDetectRulings() throws IOException { @Test public void testDontThrowNPEInShfill() throws IOException { - PDDocument pdf_document = PDDocument.load(new File("src/test/resources/technology/tabula/labor.pdf")); + PDDocument pdf_document = Loader.loadPDF(new File("src/test/resources/technology/tabula/labor.pdf")); try (ObjectExtractor oe = new ObjectExtractor(pdf_document)) { PageIterator pi = oe.extract(); @@ -101,7 +102,7 @@ public void testDontThrowNPEInShfill() throws IOException { @Test public void testExtractOnePage() throws IOException { - PDDocument pdf_document = PDDocument.load(new File("src/test/resources/technology/tabula/S2MNCEbirdisland.pdf")); + PDDocument pdf_document = Loader.loadPDF(new File("src/test/resources/technology/tabula/S2MNCEbirdisland.pdf")); assertEquals(2, pdf_document.getNumberOfPages()); try (ObjectExtractor oe = new ObjectExtractor(pdf_document)) { @@ -114,7 +115,7 @@ public void testExtractOnePage() throws IOException { @Test(expected = IndexOutOfBoundsException.class) public void 
testExtractWrongPageNumber() throws IOException { - PDDocument pdf_document = PDDocument.load(new File("src/test/resources/technology/tabula/S2MNCEbirdisland.pdf")); + PDDocument pdf_document = Loader.loadPDF(new File("src/test/resources/technology/tabula/S2MNCEbirdisland.pdf")); assertEquals(2, pdf_document.getNumberOfPages()); try (ObjectExtractor oe = new ObjectExtractor(pdf_document)) { @@ -124,7 +125,7 @@ public void testExtractWrongPageNumber() throws IOException { @Test public void testTextElementsContainedInPage() throws IOException { - PDDocument pdf_document = PDDocument.load(new File("src/test/resources/technology/tabula/cs-en-us-pbms.pdf")); + PDDocument pdf_document = Loader.loadPDF(new File("src/test/resources/technology/tabula/cs-en-us-pbms.pdf")); try (ObjectExtractor oe = new ObjectExtractor(pdf_document)) { Page page = oe.extractPage(1); @@ -137,7 +138,7 @@ public void testTextElementsContainedInPage() throws IOException { } @Test public void testDoNotNPEInPointComparator() throws IOException { - PDDocument pdf_document = PDDocument.load(new File("src/test/resources/technology/tabula/npe_issue_206.pdf")); + PDDocument pdf_document = Loader.loadPDF(new File("src/test/resources/technology/tabula/npe_issue_206.pdf")); try (ObjectExtractor oe = new ObjectExtractor(pdf_document)) { Page p = oe.extractPage(1); diff --git a/src/test/java/technology/tabula/TestProjectionProfile.java b/src/test/java/technology/tabula/TestProjectionProfile.java index e7af882f..e6d93b39 100644 --- a/src/test/java/technology/tabula/TestProjectionProfile.java +++ b/src/test/java/technology/tabula/TestProjectionProfile.java @@ -8,6 +8,7 @@ import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.font.PDType1Font; +import org.apache.pdfbox.pdmodel.font.Standard14Fonts; import org.junit.Before; import org.junit.Test; @@ -20,9 +21,10 @@ public class TestProjectionProfile { public void setUpProjectionProfile() { PDPage 
pdPage = new PDPage(); PDDocument pdDocument = new PDDocument(); - - TextElement textElement = new TextElement(5f, 15f, 10f, 20f, PDType1Font.HELVETICA, 1f, "test", 1f); - TextElement textElement2 = new TextElement(5f, 15f, 10f, 20f, PDType1Font.HELVETICA, 1f, "test", 1f); + + PDType1Font font = new PDType1Font(Standard14Fonts.FontName.HELVETICA); + TextElement textElement = new TextElement(5f, 15f, 10f, 20f, font, 1f, "test", 1f); + TextElement textElement2 = new TextElement(5f, 15f, 10f, 20f, font, 1f, "test", 1f); List textList = new ArrayList<>(); textList.add(textElement); textList.add(textElement2); diff --git a/src/test/java/technology/tabula/TestTableDetection.java b/src/test/java/technology/tabula/TestTableDetection.java index 6e58f6a4..80d21350 100644 --- a/src/test/java/technology/tabula/TestTableDetection.java +++ b/src/test/java/technology/tabula/TestTableDetection.java @@ -11,6 +11,7 @@ import static org.junit.Assert.*; import com.google.gson.Gson; +import org.apache.pdfbox.Loader; import org.junit.AfterClass; import org.junit.BeforeClass; import org.junit.Test; @@ -162,7 +163,7 @@ public void testDetectionOfTables() throws Exception { NodeList tables = regionDocument.getElementsByTagName("table"); // tabula extractors - PDDocument pdfDocument = PDDocument.load(this.pdf); + PDDocument pdfDocument = Loader.loadPDF(this.pdf); ObjectExtractor extractor = new ObjectExtractor(pdfDocument); // parse expected tables from the ground truth dataset diff --git a/src/test/java/technology/tabula/TestTextElement.java b/src/test/java/technology/tabula/TestTextElement.java index feaaa5e6..3db1ca31 100644 --- a/src/test/java/technology/tabula/TestTextElement.java +++ b/src/test/java/technology/tabula/TestTextElement.java @@ -3,205 +3,208 @@ import java.util.ArrayList; import java.util.List; +import org.apache.pdfbox.pdmodel.font.PDFont; import org.apache.pdfbox.pdmodel.font.PDType1Font; +import org.apache.pdfbox.pdmodel.font.Standard14Fonts; import org.junit.Assert; 
import org.junit.Test; public class TestTextElement { - - - @Test - public void createTextElement() { - - TextElement textElement = new TextElement(5f, 15f, 10f, 20f, PDType1Font.HELVETICA, 1f, "A", 1f); - - Assert.assertNotNull(textElement); - Assert.assertEquals("A", textElement.getText()); - Assert.assertEquals(1f, textElement.getFontSize(), 0); - Assert.assertEquals(15f, textElement.getLeft(), 0); - Assert.assertEquals(5f, textElement.getTop(), 0); - Assert.assertEquals(10f, textElement.getWidth(), 0); - Assert.assertEquals(20f, textElement.getHeight(), 0); - Assert.assertEquals(PDType1Font.HELVETICA, textElement.getFont()); - Assert.assertEquals(1f, textElement.getWidthOfSpace(), 0); - Assert.assertEquals(0f, textElement.getDirection(), 0); - - - } - - @Test - public void createTextElementWithDirection() { - - TextElement textElement = new TextElement(5f, 15f, 10f, 20f, PDType1Font.HELVETICA, 1f, "A", 1f, 6f); - - Assert.assertNotNull(textElement); - Assert.assertEquals("A", textElement.getText()); - Assert.assertEquals(1f, textElement.getFontSize(), 0); - Assert.assertEquals(15f, textElement.getLeft(), 0); - Assert.assertEquals(5f, textElement.getTop(), 0); - Assert.assertEquals(10f, textElement.getWidth(), 0); - Assert.assertEquals(20f, textElement.getHeight(), 0); - Assert.assertEquals(PDType1Font.HELVETICA, textElement.getFont()); - Assert.assertEquals(1f, textElement.getWidthOfSpace(), 0); - Assert.assertEquals(6f, textElement.getDirection(), 0); - - - } - - @Test - public void mergeFourElementsIntoFourWords() { - - List elements = new ArrayList<>(); - elements.add(new TextElement(0f, 15f, 10f, 20f, PDType1Font.HELVETICA, 1f, "A", 1f, 6f)); - elements.add(new TextElement(20f, 15f, 10f, 20f, PDType1Font.HELVETICA, 1f, "B", 1f, 6f)); - elements.add(new TextElement(40f, 15f, 10f, 20f, PDType1Font.HELVETICA, 1f, "C", 1f, 6f)); - elements.add(new TextElement(60f, 15f, 10f, 20f, PDType1Font.HELVETICA, 1f, "D", 1f, 6f)); - - List words = 
TextElement.mergeWords(elements); - - List expectedWords = new ArrayList<>(); - expectedWords.add(new TextChunk(new TextElement(0f, 15f, 10f, 20f, PDType1Font.HELVETICA, 1f, "A", 1f, 6f))); - expectedWords.add(new TextChunk(new TextElement(20f, 15f, 10f, 20f, PDType1Font.HELVETICA, 1f, "B", 1f, 6f))); - expectedWords.add(new TextChunk(new TextElement(40f, 15f, 10f, 20f, PDType1Font.HELVETICA, 1f, "C", 1f, 6f))); - expectedWords.add(new TextChunk(new TextElement(60f, 15f, 10f, 20f, PDType1Font.HELVETICA, 1f, "D", 1f, 6f))); - - Assert.assertEquals(expectedWords, words); - - } - - @Test - public void mergeFourElementsIntoOneWord() { - - List elements = new ArrayList<>(); - elements.add(new TextElement(0f, 15f, 10f, 20f, PDType1Font.HELVETICA, 1f, "A", 1f, 6f)); - elements.add(new TextElement(0f, 25f, 10f, 20f, PDType1Font.HELVETICA, 1f, "B", 1f, 6f)); - elements.add(new TextElement(0f, 35f, 10f, 20f, PDType1Font.HELVETICA, 1f, "C", 1f, 6f)); - elements.add(new TextElement(0f, 45f, 10f, 20f, PDType1Font.HELVETICA, 1f, "D", 1f, 6f)); - - List words = TextElement.mergeWords(elements); - - List expectedWords = new ArrayList<>(); - TextChunk textChunk = new TextChunk(new TextElement(0f, 15f, 10f, 20f, PDType1Font.HELVETICA, 1f, "A", 1f, 6f)); - textChunk.add(new TextElement(0f, 25f, 10f, 20f, PDType1Font.HELVETICA, 1f, "B", 1f, 6f)); - textChunk.add(new TextElement(0f, 35f, 10f, 20f, PDType1Font.HELVETICA, 1f, "C", 1f, 6f)); - textChunk.add(new TextElement(0f, 45f, 10f, 20f, PDType1Font.HELVETICA, 1f, "D", 1f, 6f)); - expectedWords.add(textChunk); - - Assert.assertEquals(expectedWords, words); - - } - - @Test - public void mergeElementsShouldBeIdempotent() { - /* - * a bug in TextElement.merge_words would delete the first TextElement in the array - * it was called with. 
Discussion here: https://github.com/tabulapdf/tabula-java/issues/78 - */ - - List elements = new ArrayList<>(); - elements.add(new TextElement(0f, 15f, 10f, 20f, PDType1Font.HELVETICA, 1f, "A", 1f, 6f)); - elements.add(new TextElement(0f, 25f, 10f, 20f, PDType1Font.HELVETICA, 1f, "B", 1f, 6f)); - elements.add(new TextElement(0f, 35f, 10f, 20f, PDType1Font.HELVETICA, 1f, "C", 1f, 6f)); - elements.add(new TextElement(0f, 45f, 10f, 20f, PDType1Font.HELVETICA, 1f, "D", 1f, 6f)); - - List words = TextElement.mergeWords(elements); - List words2 = TextElement.mergeWords(elements); - Assert.assertEquals(words, words2); - } - - @Test - public void mergeElementsWithSkippingRules() { - - List elements = new ArrayList<>(); - elements.add(new TextElement(0f, 15f, 10f, 20f, PDType1Font.HELVETICA, 1f, "A", 1f, 6f)); - elements.add(new TextElement(0f, 17f, 10f, 20f, PDType1Font.HELVETICA, 1f, "A", 1f, 6f)); - elements.add(new TextElement(0f, 25f, 10f, 20f, PDType1Font.HELVETICA, 1f, "B", 1f, 6f)); - elements.add(new TextElement(0.001f, 25f, 10f, 20f, PDType1Font.HELVETICA, 1f, " ", 1f, 6f)); - elements.add(new TextElement(0f, 35f, 10f, 20f, PDType1Font.HELVETICA, 1f, "C", 1f, 6f)); - elements.add(new TextElement(0f, 45f, 10f, 20f, PDType1Font.TIMES_ROMAN, 10f, "D", 1f, 6f)); - - List words = TextElement.mergeWords(elements); - - List expectedWords = new ArrayList<>(); - TextChunk textChunk = new TextChunk(new TextElement(0f, 15f, 10f, 20f, PDType1Font.HELVETICA, 1f, "A", 1f, 6f)); - textChunk.add(new TextElement(0f, 25f, 10f, 20f, PDType1Font.HELVETICA, 1f, "B", 1f, 6f)); - textChunk.add(new TextElement(0f, 35f, 10f, 20f, PDType1Font.HELVETICA, 1f, "C", 1f, 6f)); - textChunk.add(new TextElement(0f, 45f, 10f, 20f, PDType1Font.TIMES_ROMAN, 10f, "D", 1f, 6f)); - expectedWords.add(textChunk); - - Assert.assertEquals(expectedWords, words); - - } - - @Test - public void mergeTenElementsIntoTwoWords() { - - List elements = new ArrayList<>(); - elements.add(new TextElement(0f, 0f, 10f, 
20f, PDType1Font.HELVETICA, 1f, "H", 1f, 6f)); - elements.add(new TextElement(0f, 10f, 10f, 20f, PDType1Font.HELVETICA, 1f, "O", 1f, 6f)); - elements.add(new TextElement(0f, 20f, 10f, 20f, PDType1Font.HELVETICA, 1f, "L", 1f, 6f)); - elements.add(new TextElement(0f, 30f, 10f, 20f, PDType1Font.HELVETICA, 1f, "A", 1f, 6f)); - elements.add(new TextElement(0f, 60f, 10f, 20f, PDType1Font.HELVETICA, 1f, "M", 1f, 6f)); - elements.add(new TextElement(0f, 70f, 10f, 20f, PDType1Font.HELVETICA, 1f, "U", 1f, 6f)); - elements.add(new TextElement(0f, 80f, 10f, 20f, PDType1Font.HELVETICA, 1f, "N", 1f, 6f)); - elements.add(new TextElement(0f, 90f, 10f, 20f, PDType1Font.HELVETICA, 1f, "D", 1f, 6f)); - elements.add(new TextElement(0f, 100f, 10f, 20f, PDType1Font.HELVETICA, 1f, "O", 1f, 6f)); - - List words = TextElement.mergeWords(elements); - - List expectedWords = new ArrayList<>(); - TextChunk textChunk = new TextChunk(new TextElement(0f, 0f, 10f, 20f, PDType1Font.HELVETICA, 1f, "H", 1f, 6f)); - textChunk.add(new TextElement(0f, 10f, 10f, 20f, PDType1Font.HELVETICA, 1f, "O", 1f, 6f)); - textChunk.add(new TextElement(0f, 20f, 10f, 20f, PDType1Font.HELVETICA, 1f, "L", 1f, 6f)); - textChunk.add(new TextElement(0f, 30f, 10f, 20f, PDType1Font.HELVETICA, 1f, "A", 1f, 6f)); - textChunk.add(new TextElement(0f, 30f, 10.5f, 20f, PDType1Font.HELVETICA, 1f, " ", 1f)); //Check why width=10.5? 
- expectedWords.add(textChunk); - TextChunk textChunk2 = new TextChunk(new TextElement(0f, 60f, 10f, 20f, PDType1Font.HELVETICA, 1f, "M", 1f, 6f)); - textChunk2.add(new TextElement(0f, 70f, 10f, 20f, PDType1Font.HELVETICA, 1f, "U", 1f, 6f)); - textChunk2.add(new TextElement(0f, 80f, 10f, 20f, PDType1Font.HELVETICA, 1f, "N", 1f, 6f)); - textChunk2.add(new TextElement(0f, 90f, 10f, 20f, PDType1Font.HELVETICA, 1f, "D", 1f, 6f)); - textChunk2.add(new TextElement(0f, 100f, 10f, 20f, PDType1Font.HELVETICA, 1f, "O", 1f, 6f)); - expectedWords.add(textChunk2); - - Assert.assertEquals(2, words.size()); - Assert.assertEquals(expectedWords, words); - - } - - @Test - public void mergeTenElementsIntoTwoLines() { - - List elements = new ArrayList<>(); - elements.add(new TextElement(0f, 0f, 10f, 20f, PDType1Font.HELVETICA, 1f, "H", 1f, 6f)); - elements.add(new TextElement(0f, 10f, 10f, 20f, PDType1Font.HELVETICA, 1f, "O", 1f, 6f)); - elements.add(new TextElement(0f, 20f, 10f, 20f, PDType1Font.HELVETICA, 1f, "L", 1f, 6f)); - elements.add(new TextElement(0f, 30f, 10f, 20f, PDType1Font.HELVETICA, 1f, "A", 1f, 6f)); - elements.add(new TextElement(20f, 0f, 10f, 20f, PDType1Font.HELVETICA, 1f, "M", 1f, 6f)); - elements.add(new TextElement(20f, 10f, 10f, 20f, PDType1Font.HELVETICA, 1f, "U", 1f, 6f)); - elements.add(new TextElement(20f, 20f, 10f, 20f, PDType1Font.HELVETICA, 1f, "N", 1f, 6f)); - elements.add(new TextElement(20f, 30f, 10f, 20f, PDType1Font.HELVETICA, 1f, "D", 1f, 6f)); - elements.add(new TextElement(20f, 40f, 10f, 20f, PDType1Font.HELVETICA, 1f, "O", 1f, 6f)); - - List words = TextElement.mergeWords(elements); - - List expectedWords = new ArrayList<>(); - TextChunk textChunk = new TextChunk(new TextElement(0f, 0f, 10f, 20f, PDType1Font.HELVETICA, 1f, "H", 1f, 6f)); - textChunk.add(new TextElement(0f, 10f, 10f, 20f, PDType1Font.HELVETICA, 1f, "O", 1f, 6f)); - textChunk.add(new TextElement(0f, 20f, 10f, 20f, PDType1Font.HELVETICA, 1f, "L", 1f, 6f)); - textChunk.add(new 
TextElement(0f, 30f, 10f, 20f, PDType1Font.HELVETICA, 1f, "A", 1f, 6f)); - expectedWords.add(textChunk); - TextChunk textChunk2 = new TextChunk(new TextElement(20f, 0f, 10f, 20f, PDType1Font.HELVETICA, 1f, "M", 1f, 6f)); - textChunk2.add(new TextElement(20f, 10f, 10f, 20f, PDType1Font.HELVETICA, 1f, "U", 1f, 6f)); - textChunk2.add(new TextElement(20f, 20f, 10f, 20f, PDType1Font.HELVETICA, 1f, "N", 1f, 6f)); - textChunk2.add(new TextElement(20f, 30f, 10f, 20f, PDType1Font.HELVETICA, 1f, "D", 1f, 6f)); - textChunk2.add(new TextElement(20f, 40f, 10f, 20f, PDType1Font.HELVETICA, 1f, "O", 1f, 6f)); - expectedWords.add(textChunk2); - - Assert.assertEquals(2, words.size()); - Assert.assertEquals(expectedWords, words); - - } - - + + + @Test + public void createTextElement() { + + TextElement textElement = new TextElement(5f, 15f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "A", 1f); + + Assert.assertNotNull(textElement); + Assert.assertEquals("A", textElement.getText()); + Assert.assertEquals(1f, textElement.getFontSize(), 0); + Assert.assertEquals(15f, textElement.getLeft(), 0); + Assert.assertEquals(5f, textElement.getTop(), 0); + Assert.assertEquals(10f, textElement.getWidth(), 0); + Assert.assertEquals(20f, textElement.getHeight(), 0); + Assert.assertEquals(new PDType1Font(Standard14Fonts.FontName.HELVETICA), textElement.getFont()); + Assert.assertEquals(1f, textElement.getWidthOfSpace(), 0); + Assert.assertEquals(0f, textElement.getDirection(), 0); + + + } + + @Test + public void createTextElementWithDirection() { + + TextElement textElement = new TextElement(5f, 15f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "A", 1f, 6f); + + Assert.assertNotNull(textElement); + Assert.assertEquals("A", textElement.getText()); + Assert.assertEquals(1f, textElement.getFontSize(), 0); + Assert.assertEquals(15f, textElement.getLeft(), 0); + Assert.assertEquals(5f, textElement.getTop(), 0); + Assert.assertEquals(10f, textElement.getWidth(), 
0); + Assert.assertEquals(20f, textElement.getHeight(), 0); + Assert.assertEquals(new PDType1Font(Standard14Fonts.FontName.HELVETICA), textElement.getFont()); + Assert.assertEquals(1f, textElement.getWidthOfSpace(), 0); + Assert.assertEquals(6f, textElement.getDirection(), 0); + + + } + + @Test + public void mergeFourElementsIntoFourWords() { + + List elements = new ArrayList<>(); + elements.add(new TextElement(0f, 15f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "A", 1f, 6f)); + elements.add(new TextElement(20f, 15f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "B", 1f, 6f)); + elements.add(new TextElement(40f, 15f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "C", 1f, 6f)); + elements.add(new TextElement(60f, 15f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "D", 1f, 6f)); + + List words = TextElement.mergeWords(elements); + + List expectedWords = new ArrayList<>(); + expectedWords.add(new TextChunk(new TextElement(0f, 15f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "A", 1f, 6f))); + expectedWords.add(new TextChunk(new TextElement(20f, 15f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "B", 1f, 6f))); + expectedWords.add(new TextChunk(new TextElement(40f, 15f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "C", 1f, 6f))); + expectedWords.add(new TextChunk(new TextElement(60f, 15f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "D", 1f, 6f))); + + Assert.assertEquals(expectedWords, words); + + } + + @Test + public void mergeFourElementsIntoOneWord() { + + List elements = new ArrayList<>(); + elements.add(new TextElement(0f, 15f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "A", 1f, 6f)); + elements.add(new TextElement(0f, 25f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "B", 1f, 6f)); + elements.add(new TextElement(0f, 35f, 10f, 20f, new 
PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "C", 1f, 6f)); + elements.add(new TextElement(0f, 45f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "D", 1f, 6f)); + + List words = TextElement.mergeWords(elements); + + List expectedWords = new ArrayList<>(); + TextChunk textChunk = new TextChunk(new TextElement(0f, 15f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "A", 1f, 6f)); + textChunk.add(new TextElement(0f, 25f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "B", 1f, 6f)); + textChunk.add(new TextElement(0f, 35f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "C", 1f, 6f)); + textChunk.add(new TextElement(0f, 45f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "D", 1f, 6f)); + expectedWords.add(textChunk); + + Assert.assertEquals(expectedWords, words); + + } + + @Test + public void mergeElementsShouldBeIdempotent() { + /* + * a bug in TextElement.merge_words would delete the first TextElement in the array + * it was called with. 
Discussion here: https://github.com/tabulapdf/tabula-java/issues/78 + */ + + List elements = new ArrayList<>(); + elements.add(new TextElement(0f, 15f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "A", 1f, 6f)); + elements.add(new TextElement(0f, 25f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "B", 1f, 6f)); + elements.add(new TextElement(0f, 35f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "C", 1f, 6f)); + elements.add(new TextElement(0f, 45f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "D", 1f, 6f)); + + List words = TextElement.mergeWords(elements); + List words2 = TextElement.mergeWords(elements); + Assert.assertEquals(words, words2); + } + + @Test + public void mergeElementsWithSkippingRules() { + + List elements = new ArrayList<>(); + elements.add(new TextElement(0f, 15f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "A", 1f, 6f)); + elements.add(new TextElement(0f, 17f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "A", 1f, 6f)); + elements.add(new TextElement(0f, 25f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "B", 1f, 6f)); + elements.add(new TextElement(0.001f, 25f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, " ", 1f, 6f)); + elements.add(new TextElement(0f, 35f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "C", 1f, 6f)); + PDFont TIMES_ROMAN = new PDType1Font(Standard14Fonts.FontName.TIMES_ROMAN); + elements.add(new TextElement(0f, 45f, 10f, 20f, TIMES_ROMAN, 10f, "D", 1f, 6f)); + + List words = TextElement.mergeWords(elements); + + List expectedWords = new ArrayList<>(); + TextChunk textChunk = new TextChunk(new TextElement(0f, 15f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "A", 1f, 6f)); + textChunk.add(new TextElement(0f, 25f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "B", 1f, 6f)); + textChunk.add(new 
TextElement(0f, 35f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "C", 1f, 6f)); + textChunk.add(new TextElement(0f, 45f, 10f, 20f, TIMES_ROMAN, 10f, "D", 1f, 6f)); + expectedWords.add(textChunk); + + Assert.assertEquals(expectedWords, words); + + } + + @Test + public void mergeTenElementsIntoTwoWords() { + + List elements = new ArrayList<>(); + elements.add(new TextElement(0f, 0f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "H", 1f, 6f)); + elements.add(new TextElement(0f, 10f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "O", 1f, 6f)); + elements.add(new TextElement(0f, 20f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "L", 1f, 6f)); + elements.add(new TextElement(0f, 30f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "A", 1f, 6f)); + elements.add(new TextElement(0f, 60f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "M", 1f, 6f)); + elements.add(new TextElement(0f, 70f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "U", 1f, 6f)); + elements.add(new TextElement(0f, 80f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "N", 1f, 6f)); + elements.add(new TextElement(0f, 90f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "D", 1f, 6f)); + elements.add(new TextElement(0f, 100f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "O", 1f, 6f)); + + List words = TextElement.mergeWords(elements); + + List expectedWords = new ArrayList<>(); + TextChunk textChunk = new TextChunk(new TextElement(0f, 0f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "H", 1f, 6f)); + textChunk.add(new TextElement(0f, 10f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "O", 1f, 6f)); + textChunk.add(new TextElement(0f, 20f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "L", 1f, 6f)); + textChunk.add(new TextElement(0f, 30f, 10f, 
20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "A", 1f, 6f)); + textChunk.add(new TextElement(0f, 30f, 10.5f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, " ", 1f)); //Check why width=10.5? + expectedWords.add(textChunk); + TextChunk textChunk2 = new TextChunk(new TextElement(0f, 60f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "M", 1f, 6f)); + textChunk2.add(new TextElement(0f, 70f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "U", 1f, 6f)); + textChunk2.add(new TextElement(0f, 80f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "N", 1f, 6f)); + textChunk2.add(new TextElement(0f, 90f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "D", 1f, 6f)); + textChunk2.add(new TextElement(0f, 100f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "O", 1f, 6f)); + expectedWords.add(textChunk2); + + Assert.assertEquals(2, words.size()); + Assert.assertEquals(expectedWords, words); + + } + + @Test + public void mergeTenElementsIntoTwoLines() { + + List elements = new ArrayList<>(); + PDType1Font font = new PDType1Font(Standard14Fonts.FontName.HELVETICA); + elements.add(new TextElement(0f, 0f, 10f, 20f, font, 1f, "H", 1f, 6f)); + elements.add(new TextElement(0f, 10f, 10f, 20f, font, 1f, "O", 1f, 6f)); + elements.add(new TextElement(0f, 20f, 10f, 20f, font, 1f, "L", 1f, 6f)); + elements.add(new TextElement(0f, 30f, 10f, 20f, font, 1f, "A", 1f, 6f)); + elements.add(new TextElement(20f, 0f, 10f, 20f, font, 1f, "M", 1f, 6f)); + elements.add(new TextElement(20f, 10f, 10f, 20f, font, 1f, "U", 1f, 6f)); + elements.add(new TextElement(20f, 20f, 10f, 20f, font, 1f, "N", 1f, 6f)); + elements.add(new TextElement(20f, 30f, 10f, 20f, font, 1f, "D", 1f, 6f)); + elements.add(new TextElement(20f, 40f, 10f, 20f, font, 1f, "O", 1f, 6f)); + + List words = TextElement.mergeWords(elements); + + List expectedWords = new ArrayList<>(); + TextChunk textChunk = new 
TextChunk(new TextElement(0f, 0f, 10f, 20f, font, 1f, "H", 1f, 6f)); + textChunk.add(new TextElement(0f, 10f, 10f, 20f, font, 1f, "O", 1f, 6f)); + textChunk.add(new TextElement(0f, 20f, 10f, 20f, font, 1f, "L", 1f, 6f)); + textChunk.add(new TextElement(0f, 30f, 10f, 20f, font, 1f, "A", 1f, 6f)); + expectedWords.add(textChunk); + TextChunk textChunk2 = new TextChunk(new TextElement(20f, 0f, 10f, 20f, font, 1f, "M", 1f, 6f)); + textChunk2.add(new TextElement(20f, 10f, 10f, 20f, font, 1f, "U", 1f, 6f)); + textChunk2.add(new TextElement(20f, 20f, 10f, 20f, font, 1f, "N", 1f, 6f)); + textChunk2.add(new TextElement(20f, 30f, 10f, 20f, font, 1f, "D", 1f, 6f)); + textChunk2.add(new TextElement(20f, 40f, 10f, 20f, font, 1f, "O", 1f, 6f)); + expectedWords.add(textChunk2); + + Assert.assertEquals(2, words.size()); + Assert.assertEquals(expectedWords, words); + + } + } diff --git a/src/test/java/technology/tabula/TestUtils.java b/src/test/java/technology/tabula/TestUtils.java index e68411df..cb85cb7b 100644 --- a/src/test/java/technology/tabula/TestUtils.java +++ b/src/test/java/technology/tabula/TestUtils.java @@ -12,6 +12,7 @@ import java.util.Collections; import java.util.List; +import org.apache.pdfbox.Loader; import org.apache.pdfbox.rendering.ImageType; import org.apache.commons.cli.ParseException; import org.apache.pdfbox.pdmodel.PDDocument; @@ -122,7 +123,7 @@ public void testQuickSortLongList() { @Test public void testJPEG2000DoesNotRaise() throws IOException { - PDDocument pdf_document = PDDocument.load(new File("src/test/resources/technology/tabula/jpeg2000.pdf")); + PDDocument pdf_document = Loader.loadPDF(new File("src/test/resources/technology/tabula/jpeg2000.pdf")); PDPage page = pdf_document.getPage(0); Utils.pageConvertToImage(pdf_document, page, 360, ImageType.RGB); } diff --git a/src/test/java/technology/tabula/UtilsForTesting.java b/src/test/java/technology/tabula/UtilsForTesting.java index 3ee8efde..8d3c91cf 100644 --- 
a/src/test/java/technology/tabula/UtilsForTesting.java +++ b/src/test/java/technology/tabula/UtilsForTesting.java @@ -7,6 +7,7 @@ import org.apache.commons.csv.CSVFormat; import org.apache.commons.csv.CSVParser; import org.apache.commons.csv.CSVPrinter; +import org.apache.pdfbox.Loader; import org.apache.pdfbox.pdmodel.PDDocument; import org.junit.Assert; @@ -23,11 +24,9 @@ public static Page getAreaFromPage(String path, int page, float top, float left, public static Page getPage(String path, int pageNumber) throws IOException { ObjectExtractor oe = null; try { - PDDocument document = PDDocument - .load(new File(path)); + PDDocument document = Loader.loadPDF(new File(path)); oe = new ObjectExtractor(document); - Page page = oe.extract(pageNumber); - return page; + return oe.extract(pageNumber); } finally { if (oe != null) oe.close(); From d0241fb5ff9182d7980c3ccd572cc8bb2dba9357 Mon Sep 17 00:00:00 2001 From: young Date: Wed, 6 Mar 2024 14:59:35 +0800 Subject: [PATCH 02/26] remove useless variable --- .../tabula/detectors/SpreadsheetDetectionAlgorithm.java | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/main/java/technology/tabula/detectors/SpreadsheetDetectionAlgorithm.java b/src/main/java/technology/tabula/detectors/SpreadsheetDetectionAlgorithm.java index 243cc3bf..43136ba5 100644 --- a/src/main/java/technology/tabula/detectors/SpreadsheetDetectionAlgorithm.java +++ b/src/main/java/technology/tabula/detectors/SpreadsheetDetectionAlgorithm.java @@ -20,8 +20,6 @@ public class SpreadsheetDetectionAlgorithm implements DetectionAlgorithm { public List detect(Page page) { List cells = SpreadsheetExtractionAlgorithm.findCells(page.getHorizontalRulings(), page.getVerticalRulings()); - SpreadsheetExtractionAlgorithm sea = new SpreadsheetExtractionAlgorithm(); - List tables = SpreadsheetExtractionAlgorithm.findSpreadsheetsFromCells(cells); // we want tables to be returned from top to bottom on the page From 63de16a4e102b44ea370919625221561dc783e75 Mon Sep 17 
00:00:00 2001 From: young Date: Thu, 11 Apr 2024 14:09:16 +0800 Subject: [PATCH 03/26] exclude junit-jupiter from pdfbox --- pom.xml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pom.xml b/pom.xml index 6c71b426..f60528d0 100644 --- a/pom.xml +++ b/pom.xml @@ -263,6 +263,12 @@ org.apache.pdfbox pdfbox 3.0.1 + + + org.junit.jupiter + junit-jupiter + + From e0ee0728ca398023ab67f59626a55525de0355b0 Mon Sep 17 00:00:00 2001 From: young Date: Thu, 11 Apr 2024 14:31:10 +0800 Subject: [PATCH 04/26] update pdfbox to 3.0.2 --- pom.xml | 341 ++++++++++++++++++++++++++++---------------------------- 1 file changed, 168 insertions(+), 173 deletions(-) diff --git a/pom.xml b/pom.xml index f60528d0..52943fbf 100644 --- a/pom.xml +++ b/pom.xml @@ -1,4 +1,5 @@ - + 4.0.0 technology.tabula tabula @@ -33,16 +34,16 @@ - - snapshots - https://repository.apache.org/content/repositories/snapshots/ - - false - - - true - - + + snapshots + https://repository.apache.org/content/repositories/snapshots/ + + false + + + true + + @@ -109,20 +110,20 @@ - org.apache.maven.plugins - maven-javadoc-plugin - 3.3.1 - - 8 - - - - attach-javadocs - - jar - - - + org.apache.maven.plugins + maven-javadoc-plugin + 3.3.1 + + 8 + + + + attach-javadocs + + jar + + + org.apache.maven.plugins @@ -159,172 +160,166 @@ technology.tabula.CommandLineApp - - - jar-with-dependencies - + + + jar-with-dependencies + - - - org.apache.maven.plugins - maven-surefire-plugin - 2.22.2 - - - -Xms1024m -Xmx2048m - - - - - org.apache.maven.plugins - maven-eclipse-plugin - 2.10 - - true - true - - - - - - - - release - - + org.apache.maven.plugins - maven-javadoc-plugin - 3.3.1 + maven-surefire-plugin + 2.22.2 - 8 + + -Xms1024m -Xmx2048m - - - attach-javadocs - - jar - - - - - org.apache.maven.plugins - maven-source-plugin - 3.2.1 - - - attach-sources - - jar-no-fork - - - - - - org.apache.maven.plugins - maven-gpg-plugin - 1.6 - - - sign-artifacts - verify - - sign - - - - - - - - + + + org.apache.maven.plugins + 
maven-eclipse-plugin + 2.10 + + true + true + + + + + + + + release + + + + org.apache.maven.plugins + maven-javadoc-plugin + 3.3.1 + + 8 + + + + attach-javadocs + + jar + + + + + + org.apache.maven.plugins + maven-source-plugin + 3.2.1 + + + attach-sources + + jar-no-fork + + + + + + org.apache.maven.plugins + maven-gpg-plugin + 1.6 + + + sign-artifacts + verify + + sign + + + + + + + + - - - org.locationtech.jts - jts-core - 1.18.1 - + + + org.locationtech.jts + jts-core + 1.18.1 + - - org.slf4j - slf4j-api - 1.7.35 - + + org.slf4j + slf4j-api + 1.7.35 + - - org.slf4j - slf4j-simple - 1.7.32 - + + org.slf4j + slf4j-simple + 1.7.32 + - - org.apache.pdfbox - pdfbox - 3.0.1 - - - org.junit.jupiter - junit-jupiter - - - + + org.apache.pdfbox + pdfbox + 3.0.2 + - - org.bouncycastle - bcprov-jdk15on - 1.70 - + + org.bouncycastle + bcprov-jdk15on + 1.70 + - - org.bouncycastle - bcmail-jdk15on - 1.70 - + + org.bouncycastle + bcmail-jdk15on + 1.70 + - - junit - junit - 4.13.2 - test - + + junit + junit + 4.13.2 + test + - - commons-cli - commons-cli - 1.4 - + + commons-cli + commons-cli + 1.4 + - - org.apache.commons - commons-csv - 1.9.0 - + + org.apache.commons + commons-csv + 1.9.0 + - - com.google.code.gson - gson - 2.9.0 - + + com.google.code.gson + gson + 2.9.0 + - - com.github.jai-imageio - jai-imageio-core - 1.4.0 - + + com.github.jai-imageio + jai-imageio-core + 1.4.0 + - - com.github.jai-imageio - jai-imageio-jpeg2000 - 1.4.0 - + + com.github.jai-imageio + jai-imageio-jpeg2000 + 1.4.0 + - - org.apache.pdfbox - jbig2-imageio - 3.0.4 - - + + org.apache.pdfbox + jbig2-imageio + 3.0.4 + + From 20b1053a24402a1e3a587ee90211661027d66484 Mon Sep 17 00:00:00 2001 From: young Date: Mon, 29 Apr 2024 17:40:46 +0800 Subject: [PATCH 05/26] fix: oom for removeText --- .../detectors/NurminenDetectionAlgorithm.java | 86 ++++++++++++------- .../technology/tabula/TestTableDetection.java | 53 ++++++------ 2 files changed, 79 insertions(+), 60 deletions(-) diff --git 
a/src/main/java/technology/tabula/detectors/NurminenDetectionAlgorithm.java b/src/main/java/technology/tabula/detectors/NurminenDetectionAlgorithm.java index 9a377854..86639f66 100644 --- a/src/main/java/technology/tabula/detectors/NurminenDetectionAlgorithm.java +++ b/src/main/java/technology/tabula/detectors/NurminenDetectionAlgorithm.java @@ -1,14 +1,8 @@ package technology.tabula.detectors; -import java.awt.geom.Line2D; -import java.awt.geom.Point2D; -import java.awt.image.BufferedImage; -import java.awt.image.Raster; -import java.io.IOException; -import java.io.OutputStream; -import java.util.*; - +import org.apache.pdfbox.contentstream.PDContentStream; import org.apache.pdfbox.contentstream.operator.Operator; +import org.apache.pdfbox.contentstream.operator.OperatorName; import org.apache.pdfbox.cos.COSName; import org.apache.pdfbox.pdfparser.PDFStreamParser; import org.apache.pdfbox.pdfwriter.ContentStreamWriter; @@ -16,16 +10,17 @@ import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.common.PDStream; import org.apache.pdfbox.rendering.ImageType; - -import technology.tabula.Line; -import technology.tabula.Page; -import technology.tabula.Rectangle; -import technology.tabula.Ruling; -import technology.tabula.TextChunk; -import technology.tabula.TextElement; -import technology.tabula.Utils; +import technology.tabula.*; import technology.tabula.extractors.SpreadsheetExtractionAlgorithm; +import java.awt.geom.Line2D; +import java.awt.geom.Point2D; +import java.awt.image.BufferedImage; +import java.awt.image.Raster; +import java.io.IOException; +import java.io.OutputStream; +import java.util.*; + /** * Created by matt on 2015-12-17. *

@@ -799,25 +794,10 @@ private List getVerticalRulings(BufferedImage image) { return verticalRulings; } - - // taken from http://www.docjar.com/html/api/org/apache/pdfbox/examples/util/RemoveAllText.java.html private PDDocument removeText(PDPage page) throws IOException { PDFStreamParser parser = new PDFStreamParser(page); parser.parse(); - List newTokens = new ArrayList<>(); - while (page.hasContents()) { - Object token = parser.parseNextToken(); - if (token instanceof Operator) { - Operator op = (Operator) token; - if ("TJ".equals(op.getName()) || "Tj".equals(op.getName())) { - //remove the one argument to this operator - newTokens.remove(newTokens.size() - 1); - continue; - } - } - newTokens.add(token); - } PDDocument document = new PDDocument(); PDPage newPage = document.importPage(page); @@ -826,9 +806,51 @@ private PDDocument removeText(PDPage page) throws IOException { PDStream newContents = new PDStream(document); OutputStream out = newContents.createOutputStream(COSName.FLATE_DECODE); ContentStreamWriter writer = new ContentStreamWriter(out); - writer.writeTokens(newTokens); + List tokensWithoutText = createTokensWithoutText(page); + writer.writeTokens(tokensWithoutText); out.close(); newPage.setContents(newContents); return document; } + + + /** + * @param contentStream contentStream + * @return newTokens + * @throws IOException When parseNextToken on Error + * @see ... 
+ */ + private static List createTokensWithoutText(PDContentStream contentStream) throws IOException { + PDFStreamParser parser = new PDFStreamParser(contentStream); + Object token = parser.parseNextToken(); + List newTokens = new ArrayList<>(); + while (token != null) { + if (token instanceof Operator) { + Operator op = (Operator) token; + String opName = op.getName(); + if (OperatorName.SHOW_TEXT_ADJUSTED.equals(opName) + || OperatorName.SHOW_TEXT.equals(opName) + || OperatorName.SHOW_TEXT_LINE.equals(opName)) { + // remove the argument to this operator + newTokens.remove(newTokens.size() - 1); + + token = parser.parseNextToken(); + continue; + } else if (OperatorName.SHOW_TEXT_LINE_AND_SPACE.equals(opName)) { + // remove the 3 arguments to this operator + newTokens.remove(newTokens.size() - 1); + newTokens.remove(newTokens.size() - 1); + newTokens.remove(newTokens.size() - 1); + + token = parser.parseNextToken(); + continue; + } + } + newTokens.add(token); + token = parser.parseNextToken(); + } + return newTokens; + } + + } diff --git a/src/test/java/technology/tabula/TestTableDetection.java b/src/test/java/technology/tabula/TestTableDetection.java index 80d21350..c13ff201 100644 --- a/src/test/java/technology/tabula/TestTableDetection.java +++ b/src/test/java/technology/tabula/TestTableDetection.java @@ -1,29 +1,29 @@ package technology.tabula; -import java.io.File; -import java.io.FileWriter; -import java.io.FilenameFilter; -import java.io.IOException; -import java.util.*; -import java.util.logging.Level; -import java.util.logging.Logger; - -import static org.junit.Assert.*; - import com.google.gson.Gson; import org.apache.pdfbox.Loader; +import org.apache.pdfbox.pdmodel.PDDocument; import org.junit.AfterClass; import org.junit.BeforeClass; import org.junit.Test; import org.junit.runner.RunWith; import org.junit.runners.Parameterized; -import org.w3c.dom.*; +import org.w3c.dom.Document; +import org.w3c.dom.Element; +import org.w3c.dom.NodeList; +import 
technology.tabula.detectors.NurminenDetectionAlgorithm; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; +import java.util.*; +import java.util.logging.Level; +import java.util.logging.Logger; -import org.apache.pdfbox.pdmodel.PDDocument; -import technology.tabula.detectors.NurminenDetectionAlgorithm; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; /** * Created by matt on 2015-12-14. @@ -111,15 +111,10 @@ public static Collection data() { String directoryName = "src/test/resources/technology/tabula/icdar2013-dataset/competition-dataset-" + regionCode + "/"; File dir = new File(directoryName); - File[] pdfs = dir.listFiles(new FilenameFilter() { - @Override - public boolean accept(File dir, String name) { - return name.toLowerCase().endsWith(".pdf"); - } - }); + File[] pdfs = dir.listFiles((dir1, name) -> name.toLowerCase().endsWith(".pdf")); for (File pdf : pdfs) { - data.add(new Object[] {pdf}); + data.add(new Object[]{pdf}); } } @@ -163,6 +158,8 @@ public void testDetectionOfTables() throws Exception { NodeList tables = regionDocument.getElementsByTagName("table"); // tabula extractors + + PDDocument pdfDocument = Loader.loadPDF(this.pdf); ObjectExtractor extractor = new ObjectExtractor(pdfDocument); @@ -171,7 +168,7 @@ public void testDetectionOfTables() throws Exception { int numExpectedTables = 0; - for (int i=0; i tablesOnPage = detectionAlgorithm.detect(page); - if (tablesOnPage.size() > 0) { - detectedTables.put(new Integer(page.getPageNumber()), tablesOnPage); + if (!tablesOnPage.isEmpty()) { + detectedTables.put(page.getPageNumber(), tablesOnPage); } } @@ -267,7 +264,7 @@ public void testDetectionOfTables() throws Exception { System.out.println(totalErroneouslyDetectedTables + " tables incorrectly detected"); - if(this.status.isFirstRun()) { + if (this.status.isFirstRun()) { // make the baseline 
this.status.expectedFailure = failed; this.status.numCorrectlyDetectedTables = this.numCorrectlyDetectedTables; @@ -293,14 +290,14 @@ private List comparePages(Integer page, List detected, List detectedIterator = detected.iterator(); detectedIterator.hasNext();) { + for (Iterator detectedIterator = detected.iterator(); detectedIterator.hasNext(); ) { Rectangle detectedTable = detectedIterator.next(); - for (int i=0; i Date: Mon, 29 Apr 2024 17:51:40 +0800 Subject: [PATCH 06/26] fix: unit test --- .../technology/tabula/TestTextElement.java | 109 +++++++++--------- 1 file changed, 57 insertions(+), 52 deletions(-) diff --git a/src/test/java/technology/tabula/TestTextElement.java b/src/test/java/technology/tabula/TestTextElement.java index 3db1ca31..ee0fbf3d 100644 --- a/src/test/java/technology/tabula/TestTextElement.java +++ b/src/test/java/technology/tabula/TestTextElement.java @@ -1,14 +1,14 @@ package technology.tabula; -import java.util.ArrayList; -import java.util.List; - import org.apache.pdfbox.pdmodel.font.PDFont; import org.apache.pdfbox.pdmodel.font.PDType1Font; import org.apache.pdfbox.pdmodel.font.Standard14Fonts; import org.junit.Assert; import org.junit.Test; +import java.util.ArrayList; +import java.util.List; + public class TestTextElement { @@ -24,7 +24,7 @@ public void createTextElement() { Assert.assertEquals(5f, textElement.getTop(), 0); Assert.assertEquals(10f, textElement.getWidth(), 0); Assert.assertEquals(20f, textElement.getHeight(), 0); - Assert.assertEquals(new PDType1Font(Standard14Fonts.FontName.HELVETICA), textElement.getFont()); + Assert.assertEquals(Standard14Fonts.FontName.HELVETICA.getName(), textElement.getFont().getName()); Assert.assertEquals(1f, textElement.getWidthOfSpace(), 0); Assert.assertEquals(0f, textElement.getDirection(), 0); @@ -43,7 +43,7 @@ public void createTextElementWithDirection() { Assert.assertEquals(5f, textElement.getTop(), 0); Assert.assertEquals(10f, textElement.getWidth(), 0); Assert.assertEquals(20f, 
textElement.getHeight(), 0); - Assert.assertEquals(new PDType1Font(Standard14Fonts.FontName.HELVETICA), textElement.getFont()); + Assert.assertEquals(Standard14Fonts.FontName.HELVETICA.getName(), textElement.getFont().getName()); Assert.assertEquals(1f, textElement.getWidthOfSpace(), 0); Assert.assertEquals(6f, textElement.getDirection(), 0); @@ -54,18 +54,19 @@ public void createTextElementWithDirection() { public void mergeFourElementsIntoFourWords() { List elements = new ArrayList<>(); - elements.add(new TextElement(0f, 15f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "A", 1f, 6f)); - elements.add(new TextElement(20f, 15f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "B", 1f, 6f)); - elements.add(new TextElement(40f, 15f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "C", 1f, 6f)); - elements.add(new TextElement(60f, 15f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "D", 1f, 6f)); + PDType1Font font = new PDType1Font(Standard14Fonts.FontName.HELVETICA); + elements.add(new TextElement(0f, 15f, 10f, 20f, font, 1f, "A", 1f, 6f)); + elements.add(new TextElement(20f, 15f, 10f, 20f, font, 1f, "B", 1f, 6f)); + elements.add(new TextElement(40f, 15f, 10f, 20f, font, 1f, "C", 1f, 6f)); + elements.add(new TextElement(60f, 15f, 10f, 20f, font, 1f, "D", 1f, 6f)); List words = TextElement.mergeWords(elements); List expectedWords = new ArrayList<>(); - expectedWords.add(new TextChunk(new TextElement(0f, 15f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "A", 1f, 6f))); - expectedWords.add(new TextChunk(new TextElement(20f, 15f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "B", 1f, 6f))); - expectedWords.add(new TextChunk(new TextElement(40f, 15f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "C", 1f, 6f))); - expectedWords.add(new TextChunk(new TextElement(60f, 15f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 
1f, "D", 1f, 6f))); + expectedWords.add(new TextChunk(new TextElement(0f, 15f, 10f, 20f, font, 1f, "A", 1f, 6f))); + expectedWords.add(new TextChunk(new TextElement(20f, 15f, 10f, 20f, font, 1f, "B", 1f, 6f))); + expectedWords.add(new TextChunk(new TextElement(40f, 15f, 10f, 20f, font, 1f, "C", 1f, 6f))); + expectedWords.add(new TextChunk(new TextElement(60f, 15f, 10f, 20f, font, 1f, "D", 1f, 6f))); Assert.assertEquals(expectedWords, words); @@ -75,18 +76,19 @@ public void mergeFourElementsIntoFourWords() { public void mergeFourElementsIntoOneWord() { List elements = new ArrayList<>(); - elements.add(new TextElement(0f, 15f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "A", 1f, 6f)); - elements.add(new TextElement(0f, 25f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "B", 1f, 6f)); - elements.add(new TextElement(0f, 35f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "C", 1f, 6f)); - elements.add(new TextElement(0f, 45f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "D", 1f, 6f)); + PDType1Font font = new PDType1Font(Standard14Fonts.FontName.HELVETICA); + elements.add(new TextElement(0f, 15f, 10f, 20f, font, 1f, "A", 1f, 6f)); + elements.add(new TextElement(0f, 25f, 10f, 20f, font, 1f, "B", 1f, 6f)); + elements.add(new TextElement(0f, 35f, 10f, 20f, font, 1f, "C", 1f, 6f)); + elements.add(new TextElement(0f, 45f, 10f, 20f, font, 1f, "D", 1f, 6f)); List words = TextElement.mergeWords(elements); List expectedWords = new ArrayList<>(); - TextChunk textChunk = new TextChunk(new TextElement(0f, 15f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "A", 1f, 6f)); - textChunk.add(new TextElement(0f, 25f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "B", 1f, 6f)); - textChunk.add(new TextElement(0f, 35f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "C", 1f, 6f)); - textChunk.add(new TextElement(0f, 45f, 10f, 20f, new 
PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "D", 1f, 6f)); + TextChunk textChunk = new TextChunk(new TextElement(0f, 15f, 10f, 20f, font, 1f, "A", 1f, 6f)); + textChunk.add(new TextElement(0f, 25f, 10f, 20f, font, 1f, "B", 1f, 6f)); + textChunk.add(new TextElement(0f, 35f, 10f, 20f, font, 1f, "C", 1f, 6f)); + textChunk.add(new TextElement(0f, 45f, 10f, 20f, font, 1f, "D", 1f, 6f)); expectedWords.add(textChunk); Assert.assertEquals(expectedWords, words); @@ -101,10 +103,11 @@ public void mergeElementsShouldBeIdempotent() { */ List elements = new ArrayList<>(); - elements.add(new TextElement(0f, 15f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "A", 1f, 6f)); - elements.add(new TextElement(0f, 25f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "B", 1f, 6f)); - elements.add(new TextElement(0f, 35f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "C", 1f, 6f)); - elements.add(new TextElement(0f, 45f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "D", 1f, 6f)); + PDType1Font font = new PDType1Font(Standard14Fonts.FontName.HELVETICA); + elements.add(new TextElement(0f, 15f, 10f, 20f, font, 1f, "A", 1f, 6f)); + elements.add(new TextElement(0f, 25f, 10f, 20f, font, 1f, "B", 1f, 6f)); + elements.add(new TextElement(0f, 35f, 10f, 20f, font, 1f, "C", 1f, 6f)); + elements.add(new TextElement(0f, 45f, 10f, 20f, font, 1f, "D", 1f, 6f)); List words = TextElement.mergeWords(elements); List words2 = TextElement.mergeWords(elements); @@ -115,20 +118,21 @@ public void mergeElementsShouldBeIdempotent() { public void mergeElementsWithSkippingRules() { List elements = new ArrayList<>(); - elements.add(new TextElement(0f, 15f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "A", 1f, 6f)); - elements.add(new TextElement(0f, 17f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "A", 1f, 6f)); - elements.add(new TextElement(0f, 25f, 10f, 20f, new 
PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "B", 1f, 6f)); - elements.add(new TextElement(0.001f, 25f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, " ", 1f, 6f)); - elements.add(new TextElement(0f, 35f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "C", 1f, 6f)); + PDType1Font font = new PDType1Font(Standard14Fonts.FontName.HELVETICA); + elements.add(new TextElement(0f, 15f, 10f, 20f, font, 1f, "A", 1f, 6f)); + elements.add(new TextElement(0f, 17f, 10f, 20f, font, 1f, "A", 1f, 6f)); + elements.add(new TextElement(0f, 25f, 10f, 20f, font, 1f, "B", 1f, 6f)); + elements.add(new TextElement(0.001f, 25f, 10f, 20f, font, 1f, " ", 1f, 6f)); + elements.add(new TextElement(0f, 35f, 10f, 20f, font, 1f, "C", 1f, 6f)); PDFont TIMES_ROMAN = new PDType1Font(Standard14Fonts.FontName.TIMES_ROMAN); elements.add(new TextElement(0f, 45f, 10f, 20f, TIMES_ROMAN, 10f, "D", 1f, 6f)); List words = TextElement.mergeWords(elements); List expectedWords = new ArrayList<>(); - TextChunk textChunk = new TextChunk(new TextElement(0f, 15f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "A", 1f, 6f)); - textChunk.add(new TextElement(0f, 25f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "B", 1f, 6f)); - textChunk.add(new TextElement(0f, 35f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "C", 1f, 6f)); + TextChunk textChunk = new TextChunk(new TextElement(0f, 15f, 10f, 20f, font, 1f, "A", 1f, 6f)); + textChunk.add(new TextElement(0f, 25f, 10f, 20f, font, 1f, "B", 1f, 6f)); + textChunk.add(new TextElement(0f, 35f, 10f, 20f, font, 1f, "C", 1f, 6f)); textChunk.add(new TextElement(0f, 45f, 10f, 20f, TIMES_ROMAN, 10f, "D", 1f, 6f)); expectedWords.add(textChunk); @@ -140,30 +144,31 @@ public void mergeElementsWithSkippingRules() { public void mergeTenElementsIntoTwoWords() { List elements = new ArrayList<>(); - elements.add(new TextElement(0f, 0f, 10f, 20f, new 
PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "H", 1f, 6f)); - elements.add(new TextElement(0f, 10f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "O", 1f, 6f)); - elements.add(new TextElement(0f, 20f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "L", 1f, 6f)); - elements.add(new TextElement(0f, 30f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "A", 1f, 6f)); - elements.add(new TextElement(0f, 60f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "M", 1f, 6f)); - elements.add(new TextElement(0f, 70f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "U", 1f, 6f)); - elements.add(new TextElement(0f, 80f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "N", 1f, 6f)); - elements.add(new TextElement(0f, 90f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "D", 1f, 6f)); - elements.add(new TextElement(0f, 100f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "O", 1f, 6f)); + PDType1Font font = new PDType1Font(Standard14Fonts.FontName.HELVETICA); + elements.add(new TextElement(0f, 0f, 10f, 20f, font, 1f, "H", 1f, 6f)); + elements.add(new TextElement(0f, 10f, 10f, 20f, font, 1f, "O", 1f, 6f)); + elements.add(new TextElement(0f, 20f, 10f, 20f, font, 1f, "L", 1f, 6f)); + elements.add(new TextElement(0f, 30f, 10f, 20f, font, 1f, "A", 1f, 6f)); + elements.add(new TextElement(0f, 60f, 10f, 20f, font, 1f, "M", 1f, 6f)); + elements.add(new TextElement(0f, 70f, 10f, 20f, font, 1f, "U", 1f, 6f)); + elements.add(new TextElement(0f, 80f, 10f, 20f, font, 1f, "N", 1f, 6f)); + elements.add(new TextElement(0f, 90f, 10f, 20f, font, 1f, "D", 1f, 6f)); + elements.add(new TextElement(0f, 100f, 10f, 20f, font, 1f, "O", 1f, 6f)); List words = TextElement.mergeWords(elements); List expectedWords = new ArrayList<>(); - TextChunk textChunk = new TextChunk(new TextElement(0f, 0f, 10f, 20f, new 
PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "H", 1f, 6f)); - textChunk.add(new TextElement(0f, 10f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "O", 1f, 6f)); - textChunk.add(new TextElement(0f, 20f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "L", 1f, 6f)); - textChunk.add(new TextElement(0f, 30f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "A", 1f, 6f)); - textChunk.add(new TextElement(0f, 30f, 10.5f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, " ", 1f)); //Check why width=10.5? + TextChunk textChunk = new TextChunk(new TextElement(0f, 0f, 10f, 20f, font, 1f, "H", 1f, 6f)); + textChunk.add(new TextElement(0f, 10f, 10f, 20f, font, 1f, "O", 1f, 6f)); + textChunk.add(new TextElement(0f, 20f, 10f, 20f, font, 1f, "L", 1f, 6f)); + textChunk.add(new TextElement(0f, 30f, 10f, 20f, font, 1f, "A", 1f, 6f)); + textChunk.add(new TextElement(0f, 30f, 10.5f, 20f, font, 1f, " ", 1f)); //Check why width=10.5? 
expectedWords.add(textChunk); - TextChunk textChunk2 = new TextChunk(new TextElement(0f, 60f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "M", 1f, 6f)); - textChunk2.add(new TextElement(0f, 70f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "U", 1f, 6f)); - textChunk2.add(new TextElement(0f, 80f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "N", 1f, 6f)); - textChunk2.add(new TextElement(0f, 90f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "D", 1f, 6f)); - textChunk2.add(new TextElement(0f, 100f, 10f, 20f, new PDType1Font(Standard14Fonts.FontName.HELVETICA), 1f, "O", 1f, 6f)); + TextChunk textChunk2 = new TextChunk(new TextElement(0f, 60f, 10f, 20f, font, 1f, "M", 1f, 6f)); + textChunk2.add(new TextElement(0f, 70f, 10f, 20f, font, 1f, "U", 1f, 6f)); + textChunk2.add(new TextElement(0f, 80f, 10f, 20f, font, 1f, "N", 1f, 6f)); + textChunk2.add(new TextElement(0f, 90f, 10f, 20f, font, 1f, "D", 1f, 6f)); + textChunk2.add(new TextElement(0f, 100f, 10f, 20f, font, 1f, "O", 1f, 6f)); expectedWords.add(textChunk2); Assert.assertEquals(2, words.size()); From 6d59cddd5e4523d74aa03739be5992d35372fdd3 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 27 Feb 2023 21:12:28 +0000 Subject: [PATCH 07/26] Bump maven-compiler-plugin from 3.8.1 to 3.11.0 Bumps [maven-compiler-plugin](https://github.com/apache/maven-compiler-plugin) from 3.8.1 to 3.11.0. - [Release notes](https://github.com/apache/maven-compiler-plugin/releases) - [Commits](https://github.com/apache/maven-compiler-plugin/compare/maven-compiler-plugin-3.8.1...maven-compiler-plugin-3.11.0) --- updated-dependencies: - dependency-name: org.apache.maven.plugins:maven-compiler-plugin dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 52943fbf..b3344e12 100644 --- a/pom.xml +++ b/pom.xml @@ -147,7 +147,7 @@ maven-compiler-plugin - 3.8.1 + 3.11.0 1.8 1.8 From 2bdeb954675cb2ad05431210d3f06db74a490fe9 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 16 Jul 2024 21:29:09 +0000 Subject: [PATCH 08/26] Bump org.apache.maven.plugins:maven-gpg-plugin from 1.6 to 3.2.4 Bumps [org.apache.maven.plugins:maven-gpg-plugin](https://github.com/apache/maven-gpg-plugin) from 1.6 to 3.2.4. - [Release notes](https://github.com/apache/maven-gpg-plugin/releases) - [Commits](https://github.com/apache/maven-gpg-plugin/compare/maven-gpg-plugin-1.6...maven-gpg-plugin-3.2.4) --- updated-dependencies: - dependency-name: org.apache.maven.plugins:maven-gpg-plugin dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] --- pom.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pom.xml b/pom.xml index b3344e12..7f30e7a4 100644 --- a/pom.xml +++ b/pom.xml @@ -128,7 +128,7 @@ org.apache.maven.plugins maven-gpg-plugin - 1.6 + 3.2.4 sign-artifacts @@ -225,7 +225,7 @@ org.apache.maven.plugins maven-gpg-plugin - 1.6 + 3.2.4 sign-artifacts From c831cf6ac36c5315b96ff6a49212bb67908ce48e Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 16 Jul 2024 21:28:58 +0000 Subject: [PATCH 09/26] Bump commons-cli:commons-cli from 1.4 to 1.8.0 Bumps commons-cli:commons-cli from 1.4 to 1.8.0. --- updated-dependencies: - dependency-name: commons-cli:commons-cli dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 7f30e7a4..eb362e89 100644 --- a/pom.xml +++ b/pom.xml @@ -288,7 +288,7 @@ commons-cli commons-cli - 1.4 + 1.8.0 From 9dc64f867a01e69e6e929feaa5a909c02b9bd3e9 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 16 Jul 2024 21:28:56 +0000 Subject: [PATCH 10/26] Bump org.slf4j:slf4j-api from 1.7.35 to 2.0.13 Bumps org.slf4j:slf4j-api from 1.7.35 to 2.0.13. --- updated-dependencies: - dependency-name: org.slf4j:slf4j-api dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index eb362e89..adf29ce5 100644 --- a/pom.xml +++ b/pom.xml @@ -251,7 +251,7 @@ org.slf4j slf4j-api - 1.7.35 + 2.0.13 From 3f7445380ec4f48dfc545dd6d33e89d4c501af55 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 16 Jul 2024 21:28:55 +0000 Subject: [PATCH 11/26] Bump org.slf4j:slf4j-simple from 1.7.32 to 2.0.13 Bumps org.slf4j:slf4j-simple from 1.7.32 to 2.0.13. --- updated-dependencies: - dependency-name: org.slf4j:slf4j-simple dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index adf29ce5..8e0736c4 100644 --- a/pom.xml +++ b/pom.xml @@ -257,7 +257,7 @@ org.slf4j slf4j-simple - 1.7.32 + 2.0.13 From 2ef079f2a14dc6d66c68c5ce8d03853eea7436f4 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 21 Jun 2022 21:32:54 +0000 Subject: [PATCH 12/26] Bump jts-core from 1.18.1 to 1.19.0 Bumps jts-core from 1.18.1 to 1.19.0. 
--- updated-dependencies: - dependency-name: org.locationtech.jts:jts-core dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 8e0736c4..19bf0133 100644 --- a/pom.xml +++ b/pom.xml @@ -245,7 +245,7 @@ org.locationtech.jts jts-core - 1.18.1 + 1.19.0 From c1e4e326eddc1a2dfe59febf24a569d11bde5cfb Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 15 Feb 2023 21:59:29 +0000 Subject: [PATCH 13/26] Bump maven-javadoc-plugin from 3.3.1 to 3.5.0 Bumps [maven-javadoc-plugin](https://github.com/apache/maven-javadoc-plugin) from 3.3.1 to 3.5.0. - [Release notes](https://github.com/apache/maven-javadoc-plugin/releases) - [Commits](https://github.com/apache/maven-javadoc-plugin/compare/maven-javadoc-plugin-3.3.1...maven-javadoc-plugin-3.5.0) --- updated-dependencies: - dependency-name: org.apache.maven.plugins:maven-javadoc-plugin dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] --- pom.xml | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/pom.xml b/pom.xml index 19bf0133..749fa0b7 100644 --- a/pom.xml +++ b/pom.xml @@ -74,7 +74,7 @@ org.apache.maven.plugins maven-javadoc-plugin - 3.3.1 + 3.5.0 true @@ -110,20 +110,20 @@ - org.apache.maven.plugins - maven-javadoc-plugin - 3.3.1 - - 8 - - - - attach-javadocs - - jar - - - + org.apache.maven.plugins + maven-javadoc-plugin + 3.5.0 + + 8 + + + + attach-javadocs + + jar + + + org.apache.maven.plugins From 5761334b86f58723e761b4941f2950d7b6e53d82 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 16 Jul 2024 21:29:03 +0000 Subject: [PATCH 14/26] Bump org.sonatype.plugins:nexus-staging-maven-plugin from 1.6.8 to 1.7.0 Bumps org.sonatype.plugins:nexus-staging-maven-plugin from 1.6.8 to 1.7.0. --- updated-dependencies: - dependency-name: org.sonatype.plugins:nexus-staging-maven-plugin dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 749fa0b7..6a66162f 100644 --- a/pom.xml +++ b/pom.xml @@ -87,7 +87,7 @@ org.sonatype.plugins nexus-staging-maven-plugin - 1.6.8 + 1.7.0 true ossrh From ab7c4bd54bd20ca03c2bfad400c5cc6c26e34d59 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 16 Jul 2024 21:28:53 +0000 Subject: [PATCH 15/26] Bump org.apache.maven.plugins:maven-source-plugin from 3.2.1 to 3.3.1 Bumps [org.apache.maven.plugins:maven-source-plugin](https://github.com/apache/maven-source-plugin) from 3.2.1 to 3.3.1. 
- [Release notes](https://github.com/apache/maven-source-plugin/releases) - [Commits](https://github.com/apache/maven-source-plugin/compare/maven-source-plugin-3.2.1...maven-source-plugin-3.3.1) --- updated-dependencies: - dependency-name: org.apache.maven.plugins:maven-source-plugin dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- pom.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pom.xml b/pom.xml index 6a66162f..77cda400 100644 --- a/pom.xml +++ b/pom.xml @@ -99,7 +99,7 @@ org.apache.maven.plugins maven-source-plugin - 3.2.1 + 3.3.1 attach-sources @@ -212,7 +212,7 @@ org.apache.maven.plugins maven-source-plugin - 3.2.1 + 3.3.1 attach-sources From ebe8e30dedfd6f7553046bbe6bbd3640b121d3dd Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 17 Jul 2024 21:36:59 +0000 Subject: [PATCH 16/26] Bump org.apache.commons:commons-csv from 1.9.0 to 1.11.0 Bumps [org.apache.commons:commons-csv](https://github.com/apache/commons-csv) from 1.9.0 to 1.11.0. - [Changelog](https://github.com/apache/commons-csv/blob/master/RELEASE-NOTES.txt) - [Commits](https://github.com/apache/commons-csv/compare/rel/commons-csv-1.9.0...rel/commons-csv-1.11.0) --- updated-dependencies: - dependency-name: org.apache.commons:commons-csv dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 77cda400..a45e6089 100644 --- a/pom.xml +++ b/pom.xml @@ -294,7 +294,7 @@ org.apache.commons commons-csv - 1.9.0 + 1.11.0 From db3f6dfd74801c824efd2a25dc26b4a3cb8d7922 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 17 Jul 2024 21:36:45 +0000 Subject: [PATCH 17/26] Bump org.apache.maven.plugins:maven-compiler-plugin Bumps [org.apache.maven.plugins:maven-compiler-plugin](https://github.com/apache/maven-compiler-plugin) from 3.11.0 to 3.13.0. - [Release notes](https://github.com/apache/maven-compiler-plugin/releases) - [Commits](https://github.com/apache/maven-compiler-plugin/compare/maven-compiler-plugin-3.11.0...maven-compiler-plugin-3.13.0) --- updated-dependencies: - dependency-name: org.apache.maven.plugins:maven-compiler-plugin dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index a45e6089..b73d8b1e 100644 --- a/pom.xml +++ b/pom.xml @@ -147,7 +147,7 @@ maven-compiler-plugin - 3.11.0 + 3.13.0 1.8 1.8 From fd3a32c579f672ba17c5f1231985e980c4e3ec4e Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 17 Jul 2024 21:36:48 +0000 Subject: [PATCH 18/26] Bump com.google.code.gson:gson from 2.9.0 to 2.11.0 Bumps [com.google.code.gson:gson](https://github.com/google/gson) from 2.9.0 to 2.11.0. - [Release notes](https://github.com/google/gson/releases) - [Changelog](https://github.com/google/gson/blob/main/CHANGELOG.md) - [Commits](https://github.com/google/gson/compare/gson-parent-2.9.0...gson-parent-2.11.0) --- updated-dependencies: - dependency-name: com.google.code.gson:gson dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index b73d8b1e..394ea68f 100644 --- a/pom.xml +++ b/pom.xml @@ -300,7 +300,7 @@ com.google.code.gson gson - 2.9.0 + 2.11.0 From 097559d0a185ca1dda25d7b7ff103e884848c70c Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 17 Jul 2024 21:36:52 +0000 Subject: [PATCH 19/26] Bump org.apache.maven.plugins:maven-javadoc-plugin from 3.3.1 to 3.7.0 Bumps [org.apache.maven.plugins:maven-javadoc-plugin](https://github.com/apache/maven-javadoc-plugin) from 3.3.1 to 3.7.0. - [Release notes](https://github.com/apache/maven-javadoc-plugin/releases) - [Commits](https://github.com/apache/maven-javadoc-plugin/compare/maven-javadoc-plugin-3.3.1...maven-javadoc-plugin-3.7.0) --- updated-dependencies: - dependency-name: org.apache.maven.plugins:maven-javadoc-plugin dependency-type: direct:production update-type: version-update:semver-minor ... Signed-off-by: dependabot[bot] --- pom.xml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pom.xml b/pom.xml index 394ea68f..a4871012 100644 --- a/pom.xml +++ b/pom.xml @@ -74,7 +74,7 @@ org.apache.maven.plugins maven-javadoc-plugin - 3.5.0 + 3.7.0 true @@ -112,7 +112,7 @@ org.apache.maven.plugins maven-javadoc-plugin - 3.5.0 + 3.7.0 8 @@ -196,7 +196,7 @@ org.apache.maven.plugins maven-javadoc-plugin - 3.3.1 + 3.7.0 8 From bde6d765cfab25d53ff885de33a4556fc41bb9d7 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 17 Jul 2024 21:36:55 +0000 Subject: [PATCH 20/26] Bump org.apache.maven.plugins:maven-surefire-plugin from 2.22.2 to 3.3.1 Bumps [org.apache.maven.plugins:maven-surefire-plugin](https://github.com/apache/maven-surefire) from 2.22.2 to 3.3.1. 
- [Release notes](https://github.com/apache/maven-surefire/releases) - [Commits](https://github.com/apache/maven-surefire/compare/surefire-2.22.2...surefire-3.3.1) --- updated-dependencies: - dependency-name: org.apache.maven.plugins:maven-surefire-plugin dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index a4871012..8fd27509 100644 --- a/pom.xml +++ b/pom.xml @@ -169,7 +169,7 @@ org.apache.maven.plugins maven-surefire-plugin - 2.22.2 + 3.3.1 -Xms1024m -Xmx2048m From 0c73e698b979a74cac0e917718b2c5dfd098dacc Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 22 Jul 2024 21:03:37 +0000 Subject: [PATCH 21/26] Bump org.apache.maven.plugins:maven-javadoc-plugin from 3.7.0 to 3.8.0 Bumps [org.apache.maven.plugins:maven-javadoc-plugin](https://github.com/apache/maven-javadoc-plugin) from 3.7.0 to 3.8.0. - [Release notes](https://github.com/apache/maven-javadoc-plugin/releases) - [Commits](https://github.com/apache/maven-javadoc-plugin/compare/maven-javadoc-plugin-3.7.0...maven-javadoc-plugin-3.8.0) --- updated-dependencies: - dependency-name: org.apache.maven.plugins:maven-javadoc-plugin dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] --- pom.xml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pom.xml b/pom.xml index 8fd27509..d0b40101 100644 --- a/pom.xml +++ b/pom.xml @@ -74,7 +74,7 @@ org.apache.maven.plugins maven-javadoc-plugin - 3.7.0 + 3.8.0 true @@ -112,7 +112,7 @@ org.apache.maven.plugins maven-javadoc-plugin - 3.7.0 + 3.8.0 8 @@ -196,7 +196,7 @@ org.apache.maven.plugins maven-javadoc-plugin - 3.7.0 + 3.8.0 8 From 818c9a2f5a5ea8dc72d3efa775f192381e84b8c1 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 9 Aug 2024 21:53:01 +0000 Subject: [PATCH 22/26] Bump org.apache.pdfbox:pdfbox from 3.0.2 to 3.0.3 Bumps org.apache.pdfbox:pdfbox from 3.0.2 to 3.0.3. --- updated-dependencies: - dependency-name: org.apache.pdfbox:pdfbox dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index d0b40101..a963e35a 100644 --- a/pom.xml +++ b/pom.xml @@ -263,7 +263,7 @@ org.apache.pdfbox pdfbox - 3.0.2 + 3.0.3 From 5d91f1d733c4895d31854a641c152220f8c5f341 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 30 Aug 2024 21:39:59 +0000 Subject: [PATCH 23/26] Bump org.locationtech.jts:jts-core from 1.19.0 to 1.20.0 Bumps org.locationtech.jts:jts-core from 1.19.0 to 1.20.0. --- updated-dependencies: - dependency-name: org.locationtech.jts:jts-core dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index a963e35a..49057e90 100644 --- a/pom.xml +++ b/pom.xml @@ -245,7 +245,7 @@ org.locationtech.jts jts-core - 1.19.0 + 1.20.0 From 971ae765e84f09ed83f5808b66f764590146e923 Mon Sep 17 00:00:00 2001 From: Kyle Lacy Date: Thu, 20 Feb 2025 15:29:09 -0800 Subject: [PATCH 24/26] Upgrade BouncyCastle dependencies --- pom.xml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pom.xml b/pom.xml index 49057e90..8b7b3b2d 100644 --- a/pom.xml +++ b/pom.xml @@ -268,14 +268,14 @@ org.bouncycastle - bcprov-jdk15on - 1.70 + bcprov-jdk18on + 1.80 org.bouncycastle - bcmail-jdk15on - 1.70 + bcmail-jdk18on + 1.80 From 88154e2c15967cc4c2a2606a8da25d47b9b916c3 Mon Sep 17 00:00:00 2001 From: Tilman Hausherr Date: Wed, 19 Mar 2025 15:36:11 +0100 Subject: [PATCH 25/26] Update PDFBox --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 8b7b3b2d..211d0d4d 100644 --- a/pom.xml +++ b/pom.xml @@ -263,7 +263,7 @@ org.apache.pdfbox pdfbox - 3.0.3 + 3.0.4 From 2cdf3b4fd3f7e921dca8cc6814cdd9316be40f0f Mon Sep 17 00:00:00 2001 From: Tilman Hausherr Date: Wed, 19 Mar 2025 15:38:16 +0100 Subject: [PATCH 26/26] Adjust test Test needs to be adjusted because PDFBox now supports the /ActualText feature of PDF.
--- src/test/java/technology/tabula/TestBasicExtractor.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/test/java/technology/tabula/TestBasicExtractor.java b/src/test/java/technology/tabula/TestBasicExtractor.java index d120546f..b56fd6ea 100644 --- a/src/test/java/technology/tabula/TestBasicExtractor.java +++ b/src/test/java/technology/tabula/TestBasicExtractor.java @@ -203,7 +203,7 @@ public void testCheckSqueezeDoesntBreak() throws IOException { List> rows = table.getRows(); List firstRow = rows.get(0); List lastRow = rows.get(rows.size() - 1); - assertTrue(firstRow.get(0).getText().equals("Violent crime . . . . . . . . . . . . . . . . . .")); + assertTrue(firstRow.get(0).getText().equals("Violent crime. . . . . . . . . . . . . . . . . .")); assertTrue(lastRow.get(lastRow.size() - 1).getText().equals("(X)")); page.getPDDoc().close(); }