/**
 * Batch-transforms documents found under an input directory.
 *
 * <p>Recognised options (each takes a value): {@code input} (directory to scan),
 * {@code output} (output location) and {@code ext} (file extension to pick up).
 * Every matching file found recursively under the input directory is read with
 * {@link TrueVizToBxDocumentReader}, passed through {@code transform}, and written
 * with {@link BxDocumentToTrueVizWriter} to a new file named after the source file.
 *
 * <p>Note: the target path is built as {@code output + fileName} with no separator
 * inserted, so the {@code output} value must already end with a path separator
 * to denote a directory.
 *
 * @param args command-line arguments
 * @throws ParseException if the arguments cannot be parsed
 * @throws TransformationException if reading or writing a document fails
 * @throws IOException if a target file already exists or cannot be created,
 *         or on any other I/O failure
 * @throws AnalysisException if {@code transform} fails
 */
public void run(String[] args) throws ParseException, TransformationException, IOException, AnalysisException {
    Options options = new Options();
    options.addOption("input", true, "input path");
    options.addOption("output", true, "output path");
    options.addOption("ext", true, "extension");

    CommandLineParser parser = new DefaultParser();
    CommandLine line = parser.parse(options, args);
    String inDir = line.getOptionValue("input");
    String outDir = line.getOptionValue("output");
    String extension = line.getOptionValue("ext");

    File dir = new File(inDir);
    for (File f : FileUtils.listFiles(dir, new String[] {extension}, true)) {
        TrueVizToBxDocumentReader tvReader = new TrueVizToBxDocumentReader();

        // Close the source reader even when parsing fails, so file handles
        // are not leaked across a large batch.
        List<BxPage> pages;
        FileReader reader = new FileReader(f);
        try {
            pages = tvReader.read(reader);
        } finally {
            reader.close();
        }

        BxDocument doc = new BxDocument().setPages(pages);
        doc.setFilename(f.getName());
        BxDocument rewritten = transform(doc);

        File f2 = new File(outDir + doc.getFilename());
        BxDocumentToTrueVizWriter wrt = new BxDocumentToTrueVizWriter();
        // createNewFile() returns false when the file already exists; existing
        // output is never overwritten.
        boolean created = f2.createNewFile();
        if (!created) {
            throw new IOException("Cannot create file: " + f2.getPath());
        }

        FileWriter fw = new FileWriter(f2);
        try {
            wrt.write(fw, Lists.newArrayList(rewritten));
            fw.flush();
        } finally {
            fw.close();
        }
    }
}
/**
 * Segments all pages of a document using a fixed-size thread pool.
 *
 * <p>Works in two passes over {@code document.getPages()}:
 * <ol>
 *   <li>a {@code ComponentCounter} task per page collects the page's components,
 *       which are then handed to {@code computeDocumentOrientation};</li>
 *   <li>a {@code SingleSegmenter} task per page produces the segmented page.</li>
 * </ol>
 * Segmented pages are placed back in their original order (by task index);
 * pages whose bounds are {@code null} are left out of the result.
 *
 * @param document the document to segment
 * @return a new document containing the segmented pages
 * @throws AnalysisException if any page task fails or the calling thread is interrupted
 */
@Override
public BxDocument segmentDocument(BxDocument document) throws AnalysisException {
    // Pass 1: gather components of every page.
    List<Callable<NumBxPage>> countTasks = new ArrayList<Callable<NumBxPage>>();
    for (BxPage page : document.getPages()) {
        countTasks.add(new ComponentCounter(page));
    }
    Map<BxPage, List<Component>> componentMap = new HashMap<BxPage, List<Component>>();
    for (NumBxPage counted : runPageTasks(countTasks)) {
        componentMap.put(counted.page, counted.components);
    }

    this.computeDocumentOrientation(componentMap);

    // Pass 2: segment every page, remembering its position in the document.
    List<Callable<NumBxPage>> segmentTasks = new ArrayList<Callable<NumBxPage>>();
    int index = 0;
    for (BxPage page : document.getPages()) {
        segmentTasks.add(new SingleSegmenter(page, index++));
    }
    BxPage[] pages = new BxPage[document.getPages().size()];
    for (NumBxPage segmented : runPageTasks(segmentTasks)) {
        pages[segmented.index] = segmented.page;
    }

    BxDocument output = new BxDocument();
    for (BxPage page : pages) {
        if (page.getBounds() != null) {
            output.addPage(page);
        }
    }
    return output;
}

/**
 * Runs the given page tasks on a fresh fixed-size pool and returns their results
 * in task order. The pool is always shut down, including when a task fails or
 * the calling thread is interrupted, so worker threads are never left behind.
 */
private List<NumBxPage> runPageTasks(List<Callable<NumBxPage>> tasks) throws AnalysisException {
    ExecutorService exec = Executors.newFixedThreadPool(PdfNLMContentExtractor.THREADS_NUMBER);
    try {
        List<NumBxPage> results = new ArrayList<NumBxPage>(tasks.size());
        for (Future<NumBxPage> future : exec.invokeAll(tasks)) {
            results.add(future.get());
        }
        return results;
    } catch (ExecutionException ex) {
        throw new AnalysisException("Cannot segment pages!", ex);
    } catch (InterruptedException ex) {
        // Restore the interrupt flag so callers further up can observe it.
        Thread.currentThread().interrupt();
        throw new AnalysisException("Cannot segment pages!", ex);
    } finally {
        exec.shutdown();
    }
}
/**
 * Compares two documents zone by zone and accumulates the per-zone results.
 *
 * <p>Zones are paired by position; the number of comparisons is driven by the
 * zone count of {@code actual}.
 *
 * @param expected the reference document
 * @param actual the document under evaluation
 * @return the results of all pairwise zone comparisons added together
 */
@Override
protected ClassificationResults compareDocuments(BxDocument expected, BxDocument actual) {
    ClassificationResults total = newResults();
    int zoneNo = 0;
    while (zoneNo < actual.asZones().size()) {
        total.add(compareItems(expected.asZones().get(zoneNo), actual.asZones().get(zoneNo)));
        ++zoneNo;
    }
    return total;
}
/**
 * Compares two documents page by page at zone, line and word level.
 *
 * <p>Pages are paired by index; the page count of {@code expected} determines
 * how many pairs are compared.
 *
 * @param expected the reference document
 * @param actual the document under evaluation
 * @return results holding the accumulated zone-, line- and word-level comparisons
 */
@Override
protected Results compareItems(BxDocument expected, BxDocument actual) {
    Results summary = new Results();
    int pageNo = 0;
    while (pageNo < expected.childrenCount()) {
        BxPage referencePage = expected.getChild(pageNo);
        BxPage testedPage = actual.getChild(pageNo);

        summary.zoneLevel.add(compareZones(referencePage, testedPage));
        summary.lineLevel.add(compareLines(referencePage, testedPage));
        summary.wordLevel.add(compareWords(referencePage, testedPage));

        pageNo++;
    }
    return summary;
}
/**
 * Computes, for every {@code xml} file found recursively under the input directory,
 * the integer percentage of zones carrying a label other than
 * {@link BxZoneLabel#OTH_UNKNOWN}, prints {@code "<file name> <percentage>"} and
 * copies the file to {@code output + fileName + "." + percentage}.
 *
 * <p>Zones without any label are counted as unlabelled. A document with no zones
 * gets a percentage of 0.
 *
 * <p>Note: the target path is built by plain string concatenation, so the
 * {@code output} value must already end with a path separator to denote a directory.
 *
 * @param args command-line arguments; options {@code input} and {@code output}
 *        each take a value
 * @throws ParseException if the arguments cannot be parsed
 * @throws TransformationException if a document cannot be read
 * @throws IOException on I/O failure while reading or copying
 */
public static void main(String[] args) throws TransformationException, IOException, AnalysisException, ParseException, CloneNotSupportedException {
    Options options = new Options();
    options.addOption("input", true, "input path");
    options.addOption("output", true, "output path");

    CommandLineParser parser = new GnuParser();
    CommandLine line = parser.parse(options, args);
    String inDir = line.getOptionValue("input");
    String outDir = line.getOptionValue("output");

    File dir = new File(inDir);
    for (File f : FileUtils.listFiles(dir, new String[] {"xml"}, true)) {
        TrueVizToBxDocumentReader tvReader = new TrueVizToBxDocumentReader();

        // Close the reader even when parsing fails, so file handles are not
        // leaked across a large batch.
        List<BxPage> pages;
        FileReader reader = new FileReader(f);
        try {
            pages = tvReader.read(reader);
        } finally {
            reader.close();
        }

        BxDocument doc = new BxDocument().setPages(pages);
        doc.setFilename(f.getName());

        int all = 0;
        int good = 0;
        for (BxZone z : doc.asZones()) {
            all++;
            // A missing label is treated the same as OTH_UNKNOWN instead of
            // aborting the whole run with a NullPointerException.
            if (z.getLabel() != null && !z.getLabel().equals(BxZoneLabel.OTH_UNKNOWN)) {
                good++;
            }
        }

        int intcov = 0;
        if (all > 0) {
            intcov = good * 100 / all;
        }
        System.out.println(doc.getFilename() + " " + intcov);

        File f2 = new File(outDir + doc.getFilename() + "." + intcov);
        FileUtils.copyFile(f, f2);
    }
}
/**
 * Replaces the label of every zone in the document with that label's general label.
 *
 * @param doc the document whose zones are relabelled in place
 */
@Override
protected void preprocessDocumentForEvaluation(BxDocument doc) {
    for (BxZone current : doc.asZones()) {
        current.setLabel(current.getLabel().getGeneralLabel());
    }
}
/**
 * Exports zone training samples of all documents in a directory to two files in
 * the current working directory: {@code initial_<dirName>.dat} and
 * {@code meta_<dirName>.dat}, written through {@code toLibSVM}.
 *
 * <p>Expects exactly two positional arguments: the input directory and the file
 * extension handed to {@link DocumentsIterator}. Otherwise usage help is printed
 * and the JVM exits with status 1.
 *
 * <p>For each document the reading order is resolved first. Then every zone label
 * is normalised: a missing label becomes {@link BxZoneLabel#OTH_UNKNOWN}, and a
 * label outside {@link BxZoneLabelCategory#CAT_METADATA} is replaced by its general
 * label. Metadata samples are additionally filtered through a {@link SampleFilter}
 * for the metadata category before being written.
 *
 * @param args command-line arguments: {@code input-directory extension}
 * @throws ParseException if the arguments cannot be parsed
 * @throws IOException if an output file cannot be opened or written
 */
public static void main(String[] args) throws ParseException, IOException, TransformationException, AnalysisException, CloneNotSupportedException {
    Options options = new Options();
    CommandLineParser parser = new GnuParser();
    CommandLine line = parser.parse(options, args);

    // Validate the same array that is indexed below; the raw args may differ
    // from the parsed positional arguments (e.g. when "--" is present).
    String[] positional = line.getArgs();
    if (positional.length != 2) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(" [-options] input-directory extension", options);
        System.exit(1);
    }
    String inputDirPath = positional[0];
    File inputDirFile = new File(inputDirPath);

    HierarchicalReadingOrderResolver ror = new HierarchicalReadingOrderResolver();
    DocumentsIterator iter = new DocumentsIterator(inputDirPath, positional[1]);

    FeatureVectorBuilder<BxZone, BxPage> metaVectorBuilder = SVMMetadataZoneClassifier.getFeatureVectorBuilder();
    FeatureVectorBuilder<BxZone, BxPage> initialVectorBuilder = SVMInitialZoneClassifier.getFeatureVectorBuilder();
    SampleFilter metaSamplesFilter = new SampleFilter(BxZoneLabelCategory.CAT_METADATA);

    // Both writers are closed in finally blocks so that buffered data is flushed
    // and file handles are released even when processing a document fails.
    BufferedWriter svmInitialFile = new BufferedWriter(new FileWriter("initial_" + inputDirFile.getName() + ".dat"));
    try {
        BufferedWriter svmMetaFile = new BufferedWriter(new FileWriter("meta_" + inputDirFile.getName() + ".dat"));
        try {
            int docIdx = 0;
            for (BxDocument doc : iter) {
                String filename = doc.getFilename();
                System.out.println(docIdx + ": " + filename);

                // The file name is re-applied to the document returned by the resolver.
                BxDocument ordered = ror.resolve(doc);
                ordered.setFilename(filename);

                for (BxZone zone : ordered.asZones()) {
                    if (zone.getLabel() != null) {
                        if (zone.getLabel().getCategory() != BxZoneLabelCategory.CAT_METADATA) {
                            zone.setLabel(zone.getLabel().getGeneralLabel());
                        }
                    } else {
                        zone.setLabel(BxZoneLabel.OTH_UNKNOWN);
                    }
                }

                List<TrainingSample<BxZoneLabel>> newMetaSamples =
                        BxDocsToTrainingSamplesConverter.getZoneTrainingSamples(
                                ordered, metaVectorBuilder, BxZoneLabel.getIdentityMap());
                newMetaSamples = metaSamplesFilter.pickElements(newMetaSamples);

                List<TrainingSample<BxZoneLabel>> newInitialSamples =
                        BxDocsToTrainingSamplesConverter.getZoneTrainingSamples(
                                ordered, initialVectorBuilder, BxZoneLabel.getLabelToGeneralMap());

                for (TrainingSample<BxZoneLabel> sample : newMetaSamples) {
                    toLibSVM(sample, svmMetaFile);
                }
                for (TrainingSample<BxZoneLabel> sample : newInitialSamples) {
                    toLibSVM(sample, svmInitialFile);
                }
                ++docIdx;
            }
        } finally {
            svmMetaFile.close();
        }
    } finally {
        svmInitialFile.close();
    }
}