/**
 * Builds a {@code Zone} element describing the given zone and appends it to {@code parent}.
 *
 * <p>The element receives the zone id, bounds and next-zone id, an empty
 * {@code ZoneInsets} element, an empty {@code ZoneLines} property, an optional
 * classification derived from the zone label, and one child per line of the zone.
 * The element is attached to {@code parent} only after all of its content was built.
 *
 * @param doc    document used to create new elements
 * @param parent element the new {@code Zone} element is appended to
 * @param zone   zone to serialize
 * @param hints  passed through unchanged to the bounds and line writers
 * @throws TransformationException if the zone has a label that has no non-empty
 *         entry in {@code ZONE_LABEL_MAP}
 */
private void appendZone(Document doc, Element parent, BxZone zone, Object... hints)
        throws TransformationException {
    Element zoneElement = doc.createElement("Zone");
    appendPropertyIfNotNull(doc, zoneElement, "ZoneID", zone.getId());
    appendBounds(doc, zoneElement, "ZoneCorners", zone.getBounds(), hints);
    appendPropertyIfNotNull(doc, zoneElement, "ZoneNext", zone.getNextId());

    // Insets are always written with empty values.
    Element insetsElement = doc.createElement("ZoneInsets");
    for (String side : new String[] {"Top", "Bottom", "Left", "Right"}) {
        insetsElement.setAttribute(side, "");
    }
    zoneElement.appendChild(insetsElement);
    appendProperty(doc, zoneElement, "ZoneLines", "");

    if (zone.getLabel() != null) {
        String mappedLabel = ZONE_LABEL_MAP.get(zone.getLabel());
        if (mappedLabel == null || mappedLabel.isEmpty()) {
            throw new TransformationException("Writing down an unknown zone label: " + zone.getLabel());
        }
        appendClassification(doc, zoneElement, mappedLabel.toUpperCase(), "");
    }

    for (BxLine line : zone.getLines()) {
        appendLine(doc, zoneElement, line, hints);
    }
    parent.appendChild(zoneElement);
}
/**
 * Counts how many zones of the expected page are reproduced by the actual page.
 *
 * <p>An expected zone counts as matched when all of its chunks map to one single
 * zone of the actual page and that zone holds the same number of chunks as the
 * expected zone. Zones whose label is in {@code ignoredLabels} are skipped and
 * contribute neither to {@code all} nor to {@code matched}.
 *
 * @param expected page with the reference segmentation
 * @param actual   page with the segmentation under evaluation
 * @return counters of all considered zones and of the matched ones
 */
private LevelResults compareZones(BxPage expected, BxPage actual) {
    Map<BxChunk, BxZone> chunkToZone = BxModelUtils.mapChunksToZones(actual);
    LevelResults results = new LevelResults();
    for (BxZone expectedZone : expected) {
        if (ignoredLabels.contains(expectedZone.getLabel())) {
            continue;
        }

        // Collect every actual zone that holds at least one chunk of the expected zone.
        Set<BxZone> actualZones = new HashSet<BxZone>();
        for (BxLine line : expectedZone) {
            for (BxWord word : line) {
                for (BxChunk chunk : word) {
                    actualZones.add(chunkToZone.get(chunk));
                }
            }
        }

        if (actualZones.size() == 1) {
            BxZone actualZone = actualZones.iterator().next();
            // A null entry means the chunks are absent from the actual page:
            // that is a mismatch, not something to count chunks on.
            if (actualZone != null
                    && BxModelUtils.countChunks(actualZone) == BxModelUtils.countChunks(expectedZone)) {
                results.matched++;
            }
        }
        results.all++;
    }
    return results;
}
/**
 * Counts how many words of the expected page are reproduced by the actual page.
 *
 * <p>An expected word counts as matched when all of its chunks map to one single
 * word of the actual page and that word has the same number of children as the
 * expected word. Words inside zones whose label is in {@code ignoredLabels} are
 * skipped and contribute neither to {@code all} nor to {@code matched}.
 *
 * @param expected page with the reference segmentation
 * @param actual   page with the segmentation under evaluation
 * @return counters of all considered words and of the matched ones
 */
private LevelResults compareWords(BxPage expected, BxPage actual) {
    Map<BxChunk, BxWord> chunkToWord = BxModelUtils.mapChunksToWords(actual);
    LevelResults results = new LevelResults();
    for (BxZone expectedZone : expected) {
        if (ignoredLabels.contains(expectedZone.getLabel())) {
            continue;
        }
        for (BxLine expectedLine : expectedZone) {
            for (BxWord expectedWord : expectedLine) {
                // Collect every actual word that holds at least one chunk of the expected word.
                Set<BxWord> actualWords = new HashSet<BxWord>();
                for (BxChunk chunk : expectedWord) {
                    actualWords.add(chunkToWord.get(chunk));
                }

                if (actualWords.size() == 1) {
                    BxWord actualWord = actualWords.iterator().next();
                    // A null entry means the chunks are absent from the actual page:
                    // count it as a mismatch instead of dereferencing null.
                    if (actualWord != null
                            && actualWord.childrenCount() == expectedWord.childrenCount()) {
                        results.matched++;
                    }
                }
                results.all++;
            }
        }
    }
    return results;
}
/**
 * Computes, for every {@code .xml} file found under the input directory, the
 * percentage of zones carrying a label other than {@code OTH_UNKNOWN}, prints
 * it next to the file name, and copies the file into the output directory with
 * the percentage appended as an extra file-name suffix.
 *
 * <p>Command-line options: {@code -input <path>} and {@code -output <path>}.
 *
 * @param args command-line arguments
 * @throws IllegalArgumentException if either option is missing
 */
public static void main(String[] args) throws TransformationException, IOException, AnalysisException, ParseException, CloneNotSupportedException {
    Options options = new Options();
    options.addOption("input", true, "input path");
    options.addOption("output", true, "output path");

    CommandLineParser parser = new GnuParser();
    CommandLine line = parser.parse(options, args);
    String inDir = line.getOptionValue("input");
    String outDir = line.getOptionValue("output");
    if (inDir == null || outDir == null) {
        // Fail with a readable message instead of a NullPointerException further down.
        throw new IllegalArgumentException("Both -input and -output options are required");
    }

    File dir = new File(inDir);
    for (File f : FileUtils.listFiles(dir, new String[] {"xml"}, true)) {
        TrueVizToBxDocumentReader tvReader = new TrueVizToBxDocumentReader();

        // Close the reader even when parsing fails, so file handles do not pile up.
        List<BxPage> pages;
        FileReader reader = new FileReader(f);
        try {
            pages = tvReader.read(reader);
        } finally {
            reader.close();
        }

        BxDocument doc = new BxDocument().setPages(pages);
        doc.setFilename(f.getName());

        int all = 0;
        int good = 0;
        for (BxZone z : doc.asZones()) {
            all++;
            // A zone without any label is counted as unlabelled.
            if (z.getLabel() != null && !z.getLabel().equals(BxZoneLabel.OTH_UNKNOWN)) {
                good++;
            }
        }

        int intcov = 0;
        if (all > 0) {
            intcov = good * 100 / all;
        }
        System.out.println(doc.getFilename() + " " + intcov);

        // Resolve against the output directory so a missing trailing separator
        // does not fuse the directory name with the file name.
        File f2 = new File(outDir, doc.getFilename() + "." + intcov);
        FileUtils.copyFile(f, f2);
    }
}
/**
 * Replaces the label of every zone in the document with that label's general label.
 *
 * @param doc document whose zones are relabelled in place
 */
@Override
protected void preprocessDocumentForEvaluation(BxDocument doc) {
    for (BxZone zone : doc.asZones()) {
        zone.setLabel(zone.getLabel().getGeneralLabel());
    }
}
/**
 * Selects header lines among the body zones of a document and groups the
 * remaining body lines under them.
 *
 * <p>Only zones labelled {@code BODY_CONTENT} or {@code GEN_BODY} are examined.
 * The method proceeds in stages:
 * <ol>
 *   <li>gather statistics (height, width, x position, distance to the previous
 *       line, font index) over all body lines and pick initial candidates:
 *       zone-first lines that look like headers;</li>
 *   <li>drop candidates rejected by {@code shouldBeRemoved} or whose width
 *       z-score exceeds {@code candMaxLengthZScore};</li>
 *   <li>collect fonts shared by at least three candidates whose font index
 *       z-score is beyond {@code outlFontZScore}, and add every header-looking
 *       body line set in such a font as a candidate;</li>
 *   <li>filter again, this time against {@code candMaxLengthZScore2};</li>
 *   <li>drop candidates that have no similar candidate or more than
 *       {@code maxSimilarLinesCount} similar ones, together with the lines
 *       similar to them;</li>
 *   <li>cluster the remaining candidates in document order and keep only those
 *       belonging to the first three distinct clusters encountered;</li>
 *   <li>build the content structure: each candidate starts a header, every
 *       other body line becomes content of the most recent header.</li>
 * </ol>
 *
 * @param document document to analyse
 * @return content structure holding the header lines and their content lines
 * @throws AnalysisException declared by the interface; propagated from helpers
 */
@Override
public BxContentStructure extractHeaders(BxDocument document) throws AnalysisException {
    Population heightPopulation = new Population();
    Population fontPopulation = new Population();
    Population distancePopulation = new Population();
    Population lengthPopulation = new Population();
    Population indentationPopulation = new Population();

    // Stage 1: statistics over body lines and the initial candidate set.
    Set<BxLine> candidates = new HashSet<BxLine>();
    for (BxPage page : document) {
        for (BxZone zone : page) {
            boolean bodyZone = zone.getLabel().equals(BxZoneLabel.BODY_CONTENT)
                    || zone.getLabel().equals(BxZoneLabel.GEN_BODY);
            if (!bodyZone) {
                continue;
            }
            for (BxLine bodyLine : zone) {
                heightPopulation.addObservation(bodyLine.getHeight());
                lengthPopulation.addObservation(bodyLine.getWidth());
                indentationPopulation.addObservation(bodyLine.getX());
                // Only positive vertical distances to the previous line are recorded.
                if (bodyLine.hasPrev() && bodyLine.getY() - bodyLine.getPrev().getY() > 0) {
                    distancePopulation.addObservation(bodyLine.getY() - bodyLine.getPrev().getY());
                }
                fontPopulation.addObservation(getFontIndex(bodyLine));
                if (isFirstInZone(bodyLine) && looksLikeHeader(bodyLine)) {
                    candidates.add(bodyLine);
                }
            }
        }
    }

    // Stage 2: first filtering pass.
    Set<BxLine> rejected = new HashSet<BxLine>();
    for (BxLine candidate : candidates) {
        boolean removable = shouldBeRemoved(
                candidate, heightPopulation, fontPopulation, distancePopulation, indentationPopulation);
        boolean tooLong = lengthPopulation.getZScore(candidate.getWidth()) > candMaxLengthZScore;
        if (removable || tooLong) {
            rejected.add(candidate);
        }
    }
    candidates.removeAll(rejected);
    rejected.clear();

    // Stage 3: fonts shared by three candidates, the first of which has an outlying font index.
    Set<String> headerFonts = new HashSet<String>();
    List<BxLine> candidatesList = Lists.newArrayList(candidates);
    int candidateCount = candidatesList.size();
    for (int a = 0; a < candidateCount; a++) {
        BxLine first = candidatesList.get(a);
        for (int b = a + 1; b < candidateCount; b++) {
            BxLine second = candidatesList.get(b);
            for (int c = b + 1; c < candidateCount; c++) {
                BxLine third = candidatesList.get(c);
                if (!first.getMostPopularFontName().equals(second.getMostPopularFontName())) {
                    continue;
                }
                if (!third.getMostPopularFontName().equals(second.getMostPopularFontName())) {
                    continue;
                }
                if (Math.abs(fontPopulation.getZScore(getFontIndex(first))) > outlFontZScore) {
                    headerFonts.add(first.getMostPopularFontName());
                }
            }
        }
    }

    // Every header-looking body line set in a header font becomes a candidate too.
    for (BxPage page : document) {
        for (BxZone zone : page) {
            boolean bodyZone = zone.getLabel().equals(BxZoneLabel.BODY_CONTENT)
                    || zone.getLabel().equals(BxZoneLabel.GEN_BODY);
            if (!bodyZone) {
                continue;
            }
            for (BxLine bodyLine : zone) {
                if (looksLikeHeader(bodyLine) && headerFonts.contains(bodyLine.getMostPopularFontName())) {
                    candidates.add(bodyLine);
                }
            }
        }
    }

    // Stage 4: second filtering pass, with the second length threshold.
    for (BxLine candidate : candidates) {
        boolean removable = shouldBeRemoved(
                candidate, heightPopulation, fontPopulation, distancePopulation, indentationPopulation);
        boolean tooLong = lengthPopulation.getZScore(candidate.getWidth()) > candMaxLengthZScore2;
        if (removable || tooLong) {
            rejected.add(candidate);
        }
    }
    candidates.removeAll(rejected);
    rejected.clear();

    // Stage 5: reject candidates with no similar peers or with too many of them.
    for (BxLine candidate : candidates) {
        int similarCount = 0;
        for (BxLine other : candidates) {
            if (!candidate.equals(other) && areSimilar(candidate, other)) {
                similarCount++;
            }
        }
        if (similarCount == 0 || similarCount > maxSimilarLinesCount) {
            rejected.add(candidate);
            for (BxLine other : candidates) {
                if (areSimilar(candidate, other)) {
                    rejected.add(other);
                }
            }
        }
    }
    candidates.removeAll(rejected);

    // Stage 6: cluster the survivors in document order.
    candidatesList = new ArrayList<BxLine>();
    for (BxPage page : document) {
        for (BxZone zone : page) {
            for (BxLine anyLine : zone) {
                if (candidates.contains(anyLine)) {
                    candidatesList.add(anyLine);
                }
            }
        }
    }

    int[] clusters = headersClusterizer.clusterLines(candidatesList);
    Set<Integer> keptClusters = new HashSet<Integer>();
    for (int idx = 0; idx < clusters.length; idx++) {
        int cluster = clusters[idx];
        // The first three distinct clusters seen are kept; lines of any other cluster are dropped.
        if (keptClusters.size() < 3) {
            keptClusters.add(cluster);
        }
        if (!keptClusters.contains(cluster)) {
            candidates.remove(candidatesList.get(idx));
        }
    }

    // Stage 7: assemble the content structure.
    BxContentStructure contentStructure = new BxContentStructure();
    BxLine lastHeaderLine = null;
    for (BxPage page : document) {
        for (BxZone zone : page) {
            boolean bodyZone = zone.getLabel().equals(BxZoneLabel.BODY_CONTENT)
                    || zone.getLabel().equals(BxZoneLabel.GEN_BODY);
            if (!bodyZone) {
                continue;
            }
            for (BxLine bodyLine : zone) {
                if (candidates.contains(bodyLine)) {
                    contentStructure.addFirstHeaderLine(page, bodyLine);
                    lastHeaderLine = bodyLine;
                    continue;
                }
                if (lastHeaderLine == null) {
                    // Content seen before any header is attached to a placeholder "--" header.
                    BxChunk chunk = new BxChunk(new BxBounds(), "--");
                    BxWord word = new BxWord().addChunk(chunk);
                    lastHeaderLine = new BxLine().addWord(word);
                    contentStructure.addFirstHeaderLine(page, lastHeaderLine);
                }
                contentStructure.addContentLine(lastHeaderLine, bodyLine);
            }
        }
    }

    headerLinesCompletener.completeLines(contentStructure);
    return contentStructure;
}
/**
 * Exports zone training samples of all documents in a directory to two files
 * in the working directory: {@code initial_<dir>.dat} and {@code meta_<dir>.dat},
 * where {@code <dir>} is the name of the input directory.
 *
 * <p>Expects exactly two arguments: the input directory and the extension
 * handed to the documents iterator. Otherwise usage help is printed and the
 * JVM exits with status 1.
 *
 * <p>For every document the reading order is resolved; labels outside the
 * metadata category are replaced by their general label and missing labels by
 * {@code OTH_UNKNOWN}. Samples are then built with both feature vector
 * builders (the metadata samples additionally go through the metadata filter)
 * and written out through {@code toLibSVM}.
 *
 * @param args command-line arguments: input directory and extension
 */
public static void main(String[] args) throws ParseException, IOException, TransformationException, AnalysisException, CloneNotSupportedException {
    Options options = new Options();
    CommandLineParser parser = new GnuParser();
    CommandLine line = parser.parse(options, args);
    if (args.length != 2) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(" [-options] input-directory extension", options);
        System.exit(1);
    }

    String inputDirPath = line.getArgs()[0];
    File inputDirFile = new File(inputDirPath);
    int docIdx = 0;

    HierarchicalReadingOrderResolver ror = new HierarchicalReadingOrderResolver();
    DocumentsIterator iter = new DocumentsIterator(inputDirPath, line.getArgs()[1]);
    FeatureVectorBuilder<BxZone, BxPage> metaVectorBuilder = SVMMetadataZoneClassifier.getFeatureVectorBuilder();
    FeatureVectorBuilder<BxZone, BxPage> initialVectorBuilder = SVMInitialZoneClassifier.getFeatureVectorBuilder();
    SampleFilter metaSamplesFilter = new SampleFilter(BxZoneLabelCategory.CAT_METADATA);

    // Both writers are closed in finally blocks so that a failure while
    // processing any document neither leaks file handles nor loses buffered output.
    FileWriter initialStream = new FileWriter("initial_" + inputDirFile.getName() + ".dat");
    BufferedWriter svmInitialFile = new BufferedWriter(initialStream);
    try {
        FileWriter metaStream = new FileWriter("meta_" + inputDirFile.getName() + ".dat");
        BufferedWriter svmMetaFile = new BufferedWriter(metaStream);
        try {
            for (BxDocument doc : iter) {
                System.out.println(docIdx + ": " + doc.getFilename());
                // The file name is saved and set again on the resolved document.
                String filename = doc.getFilename();
                doc = ror.resolve(doc);
                doc.setFilename(filename);

                for (BxZone zone : doc.asZones()) {
                    if (zone.getLabel() != null) {
                        if (zone.getLabel().getCategory() != BxZoneLabelCategory.CAT_METADATA) {
                            zone.setLabel(zone.getLabel().getGeneralLabel());
                        }
                    } else {
                        zone.setLabel(BxZoneLabel.OTH_UNKNOWN);
                    }
                }

                List<TrainingSample<BxZoneLabel>> newMetaSamples =
                        BxDocsToTrainingSamplesConverter.getZoneTrainingSamples(
                                doc, metaVectorBuilder, BxZoneLabel.getIdentityMap());
                newMetaSamples = metaSamplesFilter.pickElements(newMetaSamples);

                List<TrainingSample<BxZoneLabel>> newInitialSamples =
                        BxDocsToTrainingSamplesConverter.getZoneTrainingSamples(
                                doc, initialVectorBuilder, BxZoneLabel.getLabelToGeneralMap());

                for (TrainingSample<BxZoneLabel> sample : newMetaSamples) {
                    toLibSVM(sample, svmMetaFile);
                }
                for (TrainingSample<BxZoneLabel> sample : newInitialSamples) {
                    toLibSVM(sample, svmInitialFile);
                }
                ++docIdx;
            }
        } finally {
            svmMetaFile.close();
        }
    } finally {
        svmInitialFile.close();
    }
}