From 51c753dc07b8dd1353c5d74f88e73949d3c9f3dc Mon Sep 17 00:00:00 2001
From: Asma Adala
Date: Tue, 26 Feb 2019 15:19:18 +0100
Subject: [PATCH] add test files

---
 .../technology/tabula/ObjectExtractor.java    |   2 +-
 .../technology/tabula/UseCases/UseCase1.java  | 207 ++++++++++++++++++
 2 files changed, 208 insertions(+), 1 deletion(-)
 create mode 100644 src/main/java/technology/tabula/UseCases/UseCase1.java

diff --git a/src/main/java/technology/tabula/ObjectExtractor.java b/src/main/java/technology/tabula/ObjectExtractor.java
index 87c2a2f9..75cb41ff 100644
--- a/src/main/java/technology/tabula/ObjectExtractor.java
+++ b/src/main/java/technology/tabula/ObjectExtractor.java
@@ -13,7 +13,7 @@ public ObjectExtractor(PDDocument pdfDocument) {
         this.pdfDocument = pdfDocument;
     }
 
-    protected Page extractPage(Integer pageNumber) throws IOException {
+    public Page extractPage(Integer pageNumber) throws IOException {
 
         if (pageNumber > this.pdfDocument.getNumberOfPages() || pageNumber < 1) {
             throw new java.lang.IndexOutOfBoundsException(
diff --git a/src/main/java/technology/tabula/UseCases/UseCase1.java b/src/main/java/technology/tabula/UseCases/UseCase1.java
new file mode 100644
index 00000000..fbef1516
--- /dev/null
+++ b/src/main/java/technology/tabula/UseCases/UseCase1.java
@@ -0,0 +1,207 @@
+package technology.tabula.UseCases;
+
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import technology.tabula.ObjectExtractor;
+import technology.tabula.Page;
+import technology.tabula.PageIterator;
+import technology.tabula.Rectangle;
+import technology.tabula.Table;
+import technology.tabula.UtilsForTesting;
+import technology.tabula.detectors.NurminenDetectionAlgorithm;
+import technology.tabula.extractors.BasicExtractionAlgorithm;
+import technology.tabula.extractors.SpreadsheetExtractionAlgorithm;
+import technology.tabula.writers.CSVWriter;
+import technology.tabula.writers.JSONWriter;
+import technology.tabula.writers.TSVWriter;
+
+/**
+ * Example driver that detects tables in a PDF, extracts them and writes them
+ * to disk as CSV, TSV or JSON.
+ */
+public class UseCase1 {
+
+    // Output files, relative to the working directory. The TSV and JSON
+    // outputs also carry a ".csv" suffix.
+    String output_csv_file = "output_table_csv.csv";
+    String output_tsv_file = "output_table_tsv.csv";
+    static String output_json_file = "output_table_json.csv";
+    // Default input PDF, used by main() when no path is passed as an argument.
+    static String pdfPath = "/Users/asmaadala/git/tabula-java/src/main/resources/technology/tabula/UseCases/T100_Report.pdf";
+
+    /**
+     * Runs {@link NurminenDetectionAlgorithm} on every page of a PDF.
+     *
+     * @param pdf the PDF file to analyse
+     * @return detected table areas keyed by page number; pages with no
+     *         detected area have no entry
+     * @throws Exception if the document cannot be loaded or processed
+     */
+    public static Map<Integer, List<Rectangle>> DetectTables(File pdf) throws Exception {
+        Map<Integer, List<Rectangle>> detectedTables = new HashMap<>();
+        NurminenDetectionAlgorithm detectionAlgorithm = new NurminenDetectionAlgorithm();
+
+        // Close the document when detection is done; only the detected
+        // rectangles are handed back to the caller.
+        try (PDDocument pdfDocument = PDDocument.load(pdf)) {
+            ObjectExtractor extractor = new ObjectExtractor(pdfDocument);
+            PageIterator pages = extractor.extract();
+            while (pages.hasNext()) {
+                Page page = pages.next();
+                List<Rectangle> tablesOnPage = detectionAlgorithm.detect(page);
+                if (!tablesOnPage.isEmpty()) {
+                    detectedTables.put(page.getPageNumber(), tablesOnPage);
+                }
+            }
+        }
+        return detectedTables;
+    }
+
+    /**
+     * Prints every detected area, grouped by page, to standard output.
+     *
+     * @param tables detected areas keyed by page number
+     */
+    private static void printTables(Map<Integer, List<Rectangle>> tables) {
+        for (Map.Entry<Integer, List<Rectangle>> entry : tables.entrySet()) {
+            System.out.println("Page " + entry.getKey());
+            for (Rectangle table : entry.getValue()) {
+                System.out.println(table);
+            }
+        }
+    }
+
+    /**
+     * Extracts the tables found in each detected area with
+     * {@link BasicExtractionAlgorithm}.
+     *
+     * @param tables  detected areas keyed by page number
+     * @param pdfPath path of the PDF the areas were detected in
+     * @return the extracted tables; empty when {@code tables} is empty
+     * @throws IOException if a page cannot be read
+     */
+    private static List<Table> getTables(Map<Integer, List<Rectangle>> tables, String pdfPath) throws IOException {
+        List<Table> tableslist = new ArrayList<>();
+        BasicExtractionAlgorithm bea = new BasicExtractionAlgorithm();
+
+        for (Map.Entry<Integer, List<Rectangle>> entry : tables.entrySet()) {
+            Page page = UtilsForTesting.getPage(pdfPath, entry.getKey());
+            // Extract from each detected area. The list returned by extract()
+            // is not indexed by page number, so all of its tables are kept.
+            for (Rectangle area : entry.getValue()) {
+                tableslist.addAll(bea.extract(page.getArea(area)));
+            }
+        }
+        return tableslist;
+    }
+
+    /**
+     * Extracts a single table from a page with {@link BasicExtractionAlgorithm}.
+     *
+     * @param page the page to extract from
+     * @return the first table returned by the algorithm
+     * @throws IOException declared for callers; part of the existing signature
+     */
+    private Table getTable(Page page) throws IOException {
+        BasicExtractionAlgorithm bea = new BasicExtractionAlgorithm();
+        return bea.extract(page).get(0);
+    }
+
+    /**
+     * Extracts all tables from a page with {@link SpreadsheetExtractionAlgorithm}.
+     *
+     * @param page the page to extract from
+     * @return the tables returned by the algorithm
+     * @throws IOException declared for callers; part of the existing signature
+     */
+    private static List<Table> getTables(Page page) throws IOException {
+        SpreadsheetExtractionAlgorithm sea = new SpreadsheetExtractionAlgorithm();
+        return sea.extract(page);
+    }
+
+    /**
+     * Writes a table as CSV to {@link #output_csv_file}.
+     *
+     * @param table the table to write
+     * @throws IOException if the file cannot be written
+     */
+    public void CSVWriter(Table table) throws IOException {
+        File file = new File(output_csv_file);
+        // Closing the writer flushes its buffer; an unclosed BufferedWriter
+        // can leave the file empty or truncated.
+        try (BufferedWriter writer = new BufferedWriter(new FileWriter(file))) {
+            (new CSVWriter()).write(writer, table);
+        }
+    }
+
+    /**
+     * Writes a table as TSV to {@link #output_tsv_file}.
+     *
+     * @param table the table to write
+     * @throws IOException if the file cannot be written
+     */
+    public void TSVWriter(Table table) throws IOException {
+        StringBuilder sb = new StringBuilder();
+        (new TSVWriter()).write(sb, table);
+
+        File file = new File(output_tsv_file);
+        try (BufferedWriter writer = new BufferedWriter(new FileWriter(file))) {
+            writer.write(sb.toString());
+        }
+    }
+
+    /**
+     * Writes a table as JSON to {@link #output_json_file}, replacing any
+     * previous content of that file.
+     *
+     * @param table the table to write
+     * @throws IOException if the file cannot be written
+     */
+    public static void JSONWrite(Table table) throws IOException {
+        // Render before writing: the buffer must be filled first, otherwise
+        // an empty string ends up in the file.
+        StringBuilder sb = new StringBuilder();
+        (new JSONWriter()).write(sb, table);
+        writeJsonFile(sb);
+    }
+
+    /** Writes all tables to {@link #output_json_file} in a single JSON document. */
+    private static void writeAllAsJson(List<Table> tables) throws IOException {
+        StringBuilder sb = new StringBuilder();
+        (new JSONWriter()).write(sb, tables);
+        writeJsonFile(sb);
+    }
+
+    /** Writes already-rendered JSON text to {@link #output_json_file}. */
+    private static void writeJsonFile(CharSequence json) throws IOException {
+        File file = new File(output_json_file);
+        try (BufferedWriter writer = new BufferedWriter(new FileWriter(file))) {
+            writer.write(json.toString());
+        }
+    }
+
+    /**
+     * Detects and extracts every table in a PDF and writes them to
+     * {@link #output_json_file}.
+     *
+     * @param args optional; {@code args[0]} is the PDF path, defaulting to
+     *             {@link #pdfPath}
+     * @throws Exception if detection, extraction or writing fails
+     */
+    public static void main(String[] args) throws Exception {
+        String path = args.length > 0 ? args[0] : pdfPath;
+        File pdffile = new File(path);
+
+        List<Table> listtables = getTables(DetectTables(pdffile), path);
+        // Write the list in one go: writing table by table would overwrite
+        // the same file each time and keep only the last table.
+        writeAllAsJson(listtables);
+    }
+}