private void assertOffsetConversionOk(String testData, String patStr) {
  // Build an entity at the location of patStr
  final Pattern pat = Pattern.compile(patStr);
  final Matcher matcher = pat.matcher(testData);

  List<Extractor.Entity> entities = new ArrayList<Extractor.Entity>();
  List<Integer> codePointOffsets = new ArrayList<Integer>();
  List<Integer> charOffsets = new ArrayList<Integer>();
  while (matcher.find()) {
    final int charOffset = matcher.start();
    charOffsets.add(charOffset);
    codePointOffsets.add(testData.codePointCount(0, charOffset));
    entities.add(new Extractor.Entity(matcher, Extractor.Entity.Type.HASHTAG, 0, 0));
  }

  extractor.modifyIndicesFromUTF16ToToUnicode(testData, entities);

  for (int i = 0; i < entities.size(); i++) {
    assertEquals(codePointOffsets.get(i), entities.get(i).getStart());
  }

  extractor.modifyIndicesFromUnicodeToUTF16(testData, entities);

  for (int i = 0; i < entities.size(); i++) {
    // This assertion could fail if the entity location is in the middle
    // of a surrogate pair, since there is no equivalent code point
    // offset to that location. It would be pathological for an entity to
    // start at that point, so we can just let the test fail in that case.
    assertEquals(charOffsets.get(i), entities.get(i).getStart());
  }
}
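// A minimal, hedged illustration of the char-offset vs. code-point-offset
// distinction the test above relies on, using only JDK methods. The emoji
// below occupies two UTF-16 chars (a surrogate pair) but one code point,
// so the '#' sits at char offset 2 yet code point offset 1.
public class SurrogateOffsetDemo {
  public static void main(String[] args) {
    String text = "\uD83D\uDE00#tag"; // U+1F600 GRINNING FACE followed by "#tag"
    int charOffset = text.indexOf('#'); // 2: index in UTF-16 char units
    int codePointOffset = text.codePointCount(0, charOffset); // 1: index in code points
    System.out.println(charOffset + " != " + codePointOffset);
    // Mapping a code point offset back to a char offset, as the
    // Unicode-to-UTF-16 conversion in the test must do:
    System.out.println(text.offsetByCodePoints(0, codePointOffset)); // 2
  }
}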
@Test(groups = "functest") public void shouldGetQuotesMessages() { String path = "./target/test-classes/"; Properties props = PropertyHelper.loadProperties(path); Extractor generator = new InstapaperExtractor(props, "Citas", false, "10"); Set<Entry> messages = generator.extract(); assertNotNull(messages, "no messages are retrieved"); assertTrue(messages.size() > 0, "expected more messages"); for (Entry entry : messages) { System.out.println("url:" + entry.getUrl() + ".text:" + entry.getText()); } }
public void testUrlWithSpecialCCTLDWithoutProtocol() {
  String text = "MLB.tv vine.co";
  assertList(
      "Failed to extract URLs without protocol",
      new String[] {"MLB.tv", "vine.co"},
      extractor.extractURLs(text));

  List<Extractor.Entity> extracted = extractor.extractURLsWithIndices(text);
  assertEquals(extracted.get(0).getStart().intValue(), 0);
  assertEquals(extracted.get(0).getEnd().intValue(), 6);
  assertEquals(extracted.get(1).getStart().intValue(), 7);
  assertEquals(extracted.get(1).getEnd().intValue(), 14);

  extractor.setExtractURLWithoutProtocol(false);
  assertTrue("Should not extract URLs w/o protocol", extractor.extractURLs(text).isEmpty());
}
public void setProject(IProject project) {
  super.setProject(project);
  if (project == null) {
    isGAC = true;
  }
  dllDirs = new HashSet();
}
/**
 * Auto-link hashtags, URLs, usernames and lists.
 *
 * @param text of the Tweet to auto-link
 * @return text with auto-link HTML added
 */
public String autoLink(String text) {
  text = escapeBrackets(text);

  // extract entities
  List<Entity> entities = extractor.extractEntitiesWithIndices(text);
  return autoLinkEntities(text, entities);
}
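// A minimal usage sketch for autoLink(String). The exact href and CSS class
// attributes in the output depend on the URL bases and classes configured in
// the AutolinkEx constructor shown later; the shape in the comment is
// illustrative only.
public class AutoLinkDemo {
  public static void main(String[] args) {
    AutolinkEx autolink = new AutolinkEx();
    String html = autolink.autoLink("Reading #java tips from @user at https://example.com");
    // html wraps the hashtag, the mention and the URL in <a> tags, roughly:
    // ... <a href="...">#java</a> ... <a href="...">@user</a> ...
    System.out.println(html);
  }
}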
public String autoLinkAll(
    String text,
    TwitterMediaEntity twitterMediaEntity,
    MediaEntity[] mediaEntities,
    URLEntity[] urlEntities) {
  text = escapeBrackets(text);

  // extract entities
  List<Entity> entities = extractor.extractEntitiesWithIndices(text);
  return autoLinkEntities(text, twitterMediaEntity, entities, mediaEntities, urlEntities);
}
public static void connect(String configFilename)
    throws SQLException, IOException, JSONException {
  // Read the configuration file
  JSONConfig.read(configFilename);

  // Connect to the database and instantiate the necessary extractors
  connection = Extractor.connect();

  // And then prepare all the extractors for use
  for (Class<? extends Extractor> cls : JSONConfig.getExtractorClasses()) {
    Extractor extractor = Extractor.getExtractor(cls);
    try {
      extractor.prepare();
    } catch (SQLException e) {
      System.err.println(
          "Cannot extract ontology information with extractor instance " + extractor.getClass());
      e.printStackTrace();
    }
  }
}
public AutolinkEx() {
  urlClass = null;
  listClass = DEFAULT_LIST_CLASS;
  usernameClass = DEFAULT_USERNAME_CLASS;
  hashtagClass = DEFAULT_HASHTAG_CLASS;
  cashtagClass = DEFAULT_CASHTAG_CLASS;
  usernameUrlBase = DEFAULT_USERNAME_URL_BASE;
  listUrlBase = DEFAULT_LIST_URL_BASE;
  hashtagUrlBase = DEFAULT_HASHTAG_URL_BASE;
  cashtagUrlBase = DEFAULT_CASHTAG_URL_BASE;
  invisibleTagAttrs = DEFAULT_INVISIBLE_TAG_ATTRS;

  extractor.setExtractURLWithoutProtocol(false);
}
@Override
public String[] getCommandExtractor(String url, String type) {
  String username = PreferencesManager.INSTANCE.getDatabaseUsername();
  String password = PreferencesManager.INSTANCE.getDatabasePassword();
  String host = PreferencesManager.INSTANCE.getDatabaseHost();
  String database = PreferencesManager.INSTANCE.getDatabaseNameSCM();

  String repo;
  if (type.equals("git")) {
    try {
      repo = SourcesManager.INSTANCE.getDownloadSourcePath(url);
    } catch (SourcesManagerError e) {
      repo = null;
    }
  } else {
    repo = url;
  }

  String[] cmd = {
    SCM_EXTRACTOR,
    "-u", username,
    "-p", password,
    "-d", database,
    "-H", host,
    "--extensions", "Metrics",
    "--metrics-all",
    repo
  };

  // Arrays.toString prints the elements; calling toString() on an array
  // would only print its type and identity hash.
  super.logger.debug("Command: " + java.util.Arrays.toString(cmd));
  return cmd;
}
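// Why Arrays.toString in the debug line above: printing a Java array
// directly shows only its type and identity hash, not its contents.
// A self-contained demonstration:
import java.util.Arrays;

public class ArrayLoggingDemo {
  public static void main(String[] args) {
    String[] cmd = {"extractor", "-u", "user", "--metrics-all"};
    System.out.println(cmd); // e.g. [Ljava.lang.String;@1b6d3586
    System.out.println(Arrays.toString(cmd)); // [extractor, -u, user, --metrics-all]
  }
}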
public void createInstances(String file, Instances is) throws Exception {
  CONLLReader09 depReader = new CONLLReader09(file);

  mf.register(REL, "<root-type>");

  // Register at least one predicate, since the parsing data might not contain
  // predicates (as in the Japanese corpus) while the development sets contain some.

  long sl = 0;

  System.out.print("Registering feature parts of sentence: ");
  int ic = 0;
  int del = 0;
  while (true) {
    SentenceData09 instance = depReader.getNext();
    if (instance == null) break;
    ic++;

    sl += instance.labels.length;

    if (ic % 1000 == 0) {
      del = outValue(ic, del);
    }

    String[] labs1 = instance.labels;
    for (int i1 = 0; i1 < labs1.length; i1++) mf.register(REL, labs1[i1]);

    String[] w = instance.forms;
    for (int i1 = 0; i1 < w.length; i1++) mf.register(WORD, depReader.normalize(w[i1]));

    w = instance.plemmas;
    for (int i1 = 0; i1 < w.length; i1++) mf.register(WORD, depReader.normalize(w[i1]));

    w = instance.ppos;
    for (int i1 = 0; i1 < w.length; i1++) mf.register(POS, w[i1]);

    w = instance.gpos;
    for (int i1 = 0; i1 < w.length; i1++) mf.register(POS, w[i1]);

    if (instance.feats != null) {
      String[][] fs = instance.feats;
      for (int i1 = 0; i1 < fs.length; i1++) {
        w = fs[i1];
        if (w == null) continue;
        for (int i2 = 0; i2 < w.length; i2++) mf.register(FEAT, w[i2]);
      }
    }

    if ((ic - 1) > options.count) break;
  }
  del = outValue(ic, del);
  System.out.println();

  Extractor.initFeatures();
  Extractor.maxForm = mf.getFeatureCounter().get(WORD);

  if (options.clusterFile == null) cl = new Cluster();
  else cl = new Cluster(options.clusterFile, mf, 6);

  mf.calculateBits();
  Extractor.initStat(options.featureCreation);

  System.out.println(mf.toString());

  for (Extractor e : extractor) e.init();

  depReader.startReading(file);

  int num1 = 0;

  is.init(ic, new MFO());

  Edges.init(mf.getFeatureCounter().get(POS));

  System.out.print("Creating edge filters and reading corpus: ");
  del = 0;

  while (true) {
    if (num1 % 100 == 0) del = outValue(num1, del);

    SentenceData09 instance1 = depReader.getNext(is);
    if (instance1 == null) break;

    int last = is.size() - 1;
    short[] pos = is.pposs[last];

    for (int k = 0; k < is.length(last); k++) {
      if (is.heads[last][k] < 0) continue;
      Edges.put(pos[is.heads[last][k]], pos[k], is.labels[last][k]);
      // Edges.put(pos[k], pos[is.heads[last][k]], is.labels[last][k]);
    }

    if (!options.allFeatures && num1 > options.count) break;

    num1++;
  }
  del = outValue(num1, del);
  System.out.println();
  Edges.findDefault();
}
@Override
public Object call() {
  try {
    F2SF f = para;

    short[] pos = is.pposs[i];
    int length = pos.length;

    long[] gvs = new long[50];
    long[] svs = new long[220];

    while (true) {
      DSet set = get();
      if (set == null) {
        break;
      }

      int w1 = set.w1;
      int w2 = set.w2;

      f.clear();
      extractor.basic(pos, w1, w2, f);
      d.pl[w1][w2] = f.getScoreF();

      f.clear();
      extractor.basic(pos, w2, w1, f);
      d.pl[w2][w1] = f.getScoreF();

      short[] labels = Edges.get(pos[w1], pos[w2]);
      float[] lab = d.lab[w1][w2];

      final Long2IntInterface li = extractor.li;

      int c = extractor.firstm(is, i, w1, w2, 0, cluster, svs);

      for (int l = 0; l < lab.length; l++) {
        lab[l] = -100;
      }
      for (int l = 0; l < labels.length; l++) {
        short label = labels[l];
        f.clear();
        int lv = extractor.d0.computeLabeValue(label, Extractor.s_type);
        for (int k = 0; k < c; k++) {
          if (svs[k] > 0) {
            f.add(li.l2i(svs[k] + lv));
          }
        }
        lab[label] = f.getScoreF();
      }

      labels = Edges.get(pos[w2], pos[w1]);
      lab = d.lab[w2][w1];
      for (int l = 0; l < lab.length; l++) {
        lab[l] = -100;
      }
      for (int l = 0; l < labels.length; l++) {
        int label = labels[l];
        f.clear();
        int lv = extractor.d0.computeLabeValue(label + Extractor.s_rel1, Extractor.s_type);
        for (int k = 0; k < c; k++) {
          if (svs[k] > 0) {
            f.add(li.l2i(svs[k] + lv));
          }
        }
        lab[label] = f.getScoreF();
      }

      int s = w1 < w2 ? w1 : w2;
      int e = w1 < w2 ? w2 : w1;

      for (int m = 0; m < length; m++) {
        int g = (m == s || e == m) ? -1 : m;

        int cn = extractor.second(is, i, w1, w2, g, 0, cluster, svs);
        int cc = extractor.addClusterFeatures(is, i, w1, w2, g, cluster, 0, gvs, 0);
        // for (int k = 0; k < c; k++) dl1.map(f, svs[k]);

        if (m >= w1) {
          labels = Edges.get(pos[w1], pos[w2]);
          float[] lab2 = new float[labels.length];
          for (int l = 0; l < labels.length; l++) {
            short label = labels[l];
            int lx = label + Extractor.s_rel1 * (g < w2 ? 0 : 2);
            f.clear();
            int lv = extractor.d0.computeLabeValue(lx, Extractor.s_type);
            for (int k = 0; k < cn; k++) {
              if (svs[k] > 0) {
                f.add(li.l2i(svs[k] + lv));
              }
            }
            for (int k = 0; k < cc; k++) {
              if (gvs[k] > 0) {
                f.add(li.l2i(gvs[k] + lv));
              }
            }
            lab2[l] = f.getScoreF();
          }
          d.gra[w1][w2][m] = lab2;
        }

        if (m <= w2) {
          labels = Edges.get(pos[w2], pos[w1]);
          float[] lab2;
          d.gra[w2][w1][m] = lab2 = new float[labels.length];
          for (int l = 0; l < labels.length; l++) {
            int label = labels[l];
            int lx = label + Extractor.s_rel1 * (1 + (g < w1 ? 0 : 2));
            f.clear();
            int lv = extractor.d0.computeLabeValue(lx, Extractor.s_type);
            for (int k = 0; k < cn; k++) {
              if (svs[k] > 0) {
                f.add(li.l2i(svs[k] + lv));
              }
            }
            for (int k = 0; k < cc; k++) {
              if (gvs[k] > 0) {
                f.add(li.l2i(gvs[k] + lv));
              }
            }
            lab2[l] = f.getScoreF();
          }
        }

        g = (m == s || e == m) ? -1 : m;
        // int cn = extractor.second(is, i, w1, w2, g, 0, cluster, svs, Extractor._SIB);

        if (m >= w1 && m <= w2) {
          labels = Edges.get(pos[w1], pos[w2]);
          float[] lab2 = new float[labels.length];
          d.sib[w1][w2][m] = lab2;
          for (int l = 0; l < labels.length; l++) {
            short label = labels[l];
            int lx = label + Extractor.s_rel1 * (8);
            f.clear();
            int lv = extractor.d0.computeLabeValue(lx, Extractor.s_type);
            for (int k = 0; k < cn; k++) {
              if (svs[k] > 0) {
                f.add(li.l2i(svs[k] + lv));
              }
            }
            for (int k = 0; k < cc; k++) {
              if (gvs[k] > 0) {
                f.add(li.l2i(gvs[k] + lv));
              }
            }
            lab2[l] = (float) f.score; // f.getScoreF();
          }
        }

        if (m >= w1 && m <= w2) {
          labels = Edges.get(pos[w2], pos[w1]);
          float[] lab2 = new float[labels.length];
          d.sib[w2][w1][m] = lab2;
          for (int l = 0; l < labels.length; l++) {
            int label = labels[l];
            int lx = label + Extractor.s_rel1 * (9);
            f.clear();
            int lv = extractor.d0.computeLabeValue(lx, Extractor.s_type);
            for (int k = 0; k < cn; k++) {
              if (svs[k] > 0) {
                f.add(li.l2i(svs[k] + lv));
              }
            }
            for (int k = 0; k < cc; k++) {
              if (gvs[k] > 0) {
                f.add(li.l2i(gvs[k] + lv));
              }
            }
            lab2[l] = f.score; // f.getScoreF();
          }
        }
      }
    }
  } catch (Exception e) {
    e.printStackTrace();
  }
  return null;
}
public boolean cancelJob() {
  cancelSignalled = true;
  return extractor.cancelExtraction();
}
/**
 * Auto-link $cashtag references in the provided Tweet text. The $cashtag links will have the
 * cashtagClass CSS class added.
 *
 * @param text of the Tweet to auto-link
 * @return text with auto-link HTML added
 */
public String autoLinkCashtags(String text) {
  return autoLinkEntities(text, extractor.extractCashtagsWithIndices(text));
}
/**
 * Auto-link URLs in the Tweet text provided.
 *
 * <p>This only auto-links URLs with protocol.
 *
 * @param text of the Tweet to auto-link
 * @return text with auto-link HTML added
 */
public String autoLinkURLs(String text) {
  return autoLinkEntities(text, extractor.extractURLsWithIndices(text));
}
/**
 * Auto-link the @username and @username/list references in the provided text. Links to @username
 * references will have the usernameClass CSS classes added. Links to @username/list references
 * will have the listClass CSS class added.
 *
 * @param text of the Tweet to auto-link
 * @return text with auto-link HTML added
 */
public String autoLinkUsernamesAndLists(String text) {
  return autoLinkEntities(text, extractor.extractMentionsOrListsWithIndices(text));
}
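// A minimal sketch exercising the entity-specific variants above
// (autoLinkURLs, autoLinkCashtags, autoLinkUsernamesAndLists); the output
// attributes again depend on the configured CSS classes and URL bases.
public class PerEntityAutoLinkDemo {
  public static void main(String[] args) {
    AutolinkEx autolink = new AutolinkEx();
    System.out.println(autolink.autoLinkURLs("docs live at https://example.com"));
    System.out.println(autolink.autoLinkCashtags("watching $TWTR today"));
    System.out.println(autolink.autoLinkUsernamesAndLists("cc @user and @user/reading-list"));
  }
}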
public void extractToFiles(
    File outDir, ExtractionObserver obs, boolean allowCancel, boolean convertData)
    throws IOException {
  cancelSignalled = false;

  if (outDir.exists() && !outDir.isDirectory()) {
    throw new IllegalArgumentException(outDir.getAbsolutePath() + " is not a directory");
  }

  LodEntry currentEntry;

  if (!outDir.exists()) {
    outDir.mkdirs();
    obs.directoryCreated(outDir);
  }

  while (entriesToExtractIterator.hasNext() && !(allowCancel && cancelSignalled)) {
    currentEntry = (LodEntry) entriesToExtractIterator.next();
    String filename = currentEntry.getFileName();

    FormatConverter converter = null;
    if (convertData) {
      converter = currentEntry.getFormatConverter();
    } else {
      converter = new NullFormatConverter();
      filename = filename + ".raw";
    }

    String identifier = currentEntry.getName();

    if (converter.requiresMultipleStreams()) {
      // IMPLEMENT: replace basefilename
      String baseFilename = filename;
      String[] filenames = converter.getSuggestedFilenames(baseFilename);
      OutputStream[] outputStreamArray = new OutputStream[filenames.length];
      for (int i = 0; i < outputStreamArray.length; ++i) {
        outputStreamArray[i] =
            new BufferedOutputStream(new FileOutputStream(new File(outDir, filenames[i])));
      }
      converter.setDestinationOutputStreamsForNewFormat(outputStreamArray, currentEntry);
    } else {
      converter.setDestinationOutputStreamForNewFormat(
          new BufferedOutputStream(new FileOutputStream(new File(outDir, filename))),
          currentEntry);
    }

    Extractor extractor = new Extractor();
    extractor.convert(
        identifier,
        currentEntry.getData(),
        converter.getSourceOutputStreamForOldFormat(),
        (null != obs) ? new EntryObserver(obs, total) : null,
        allowCancel);

    if (null != obs) {
      obs.extractionProgress(identifier, counter++ / total);
    }
  }

  if (obs != null) {
    obs.extractionFinished("Done");
  }
}
public static void closeConnection() {
  Extractor.closeConnection();
}
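// A minimal lifecycle sketch tying the connect(...) method shown earlier and
// closeConnection() together; "config.json" is a hypothetical configuration path.
public static void runExtraction() throws Exception {
  connect("config.json"); // reads the config, connects, prepares all extractors
  try {
    // ... use the prepared extractors here ...
  } finally {
    closeConnection();
  }
}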
private ObjectDataExtractor(final String odeFolderPath, final String mapPath) throws Exception {
  Log.entry(odeFolderPath, mapPath);

  final MPQArchive map = new MPQArchive(mapPath);

  final File odeFolder = new File(odeFolderPath);
  final File jFile = new File(odeFolder, FILE_NAME_SCRIPT);
  final File wtsFile = new File(odeFolder, FILE_NAME_WTS);
  final File w3uFile = new File(odeFolder, FILE_NAME_W3U);
  final File w3tFile = new File(odeFolder, FILE_NAME_W3T);
  final File w3bFile = new File(odeFolder, FILE_NAME_W3B);

  map.extractFile(FILE_NAME_SCRIPT, jFile);
  map.extractFile(FILE_NAME_WTS, wtsFile);

  Log.info("Loading JASS into memory");
  String scriptContent = readFileToString(jFile.getPath(), StandardCharsets.UTF_8);

  Log.info("Loading WTS from file");
  final WTSFile wts = new WTSFile(wtsFile.getPath());

  final List<Extractor<?, ?>> extractors = new LinkedList<Extractor<?, ?>>();

  // -------------------------------------------------------
  // Unit Data Extractor
  extractors.add(
      new UnitDataExtractor(
          odeFolder, loadW3OFile(W3UFile.class, map, FILE_NAME_W3U, w3uFile, wts)));

  // -------------------------------------------------------
  // Item Data Extractor
  extractors.add(
      new ItemDataExtractor(
          odeFolder, loadW3OFile(W3TFile.class, map, FILE_NAME_W3T, w3tFile, wts)));

  // -------------------------------------------------------
  // Destructable Data Extractor
  extractors.add(
      new DestructableDataExtractor(
          odeFolder, loadW3OFile(W3BFile.class, map, FILE_NAME_W3B, w3bFile, wts)));

  for (final Extractor<?, ?> extractor : extractors) {
    scriptContent = extractor.processScript(scriptContent);
  }

  Log.info("Writing new script file to disk");
  final FileWriter fw = new FileWriter(jFile);
  fw.write(scriptContent);
  fw.close();

  Log.info("Replacing script file in map");
  final MPQFile file = new MPQFile(map, FILE_NAME_SCRIPT, MPQFileOpenScope.MPQ);
  file.removeFromArchive();
  file.close();

  final MPQCompressionFlags compr = new MPQCompressionFlags();
  compr.setCompression(Compression.BZIP2);
  map.addFile(jFile.getAbsolutePath(), FILE_NAME_SCRIPT, MPQFileFlags.fromInteger(0x200), compr);
  map.compactArchive((String) null);
  map.close();

  Log.exit();
}
public static void main(String[] args) {
  Options op = new Options(new EnglishTreebankParserParams());
  // op.tlpParams may be changed to something else later, so don't use it till
  // after options are parsed.

  System.out.println(StringUtils.toInvocationString("FactoredParser", args));

  String path = "/u/nlp/stuff/corpora/Treebank3/parsed/mrg/wsj";
  int trainLow = 200, trainHigh = 2199, testLow = 2200, testHigh = 2219;
  String serializeFile = null;

  int i = 0;
  while (i < args.length && args[i].startsWith("-")) {
    if (args[i].equalsIgnoreCase("-path") && (i + 1 < args.length)) {
      path = args[i + 1];
      i += 2;
    } else if (args[i].equalsIgnoreCase("-train") && (i + 2 < args.length)) {
      trainLow = Integer.parseInt(args[i + 1]);
      trainHigh = Integer.parseInt(args[i + 2]);
      i += 3;
    } else if (args[i].equalsIgnoreCase("-test") && (i + 2 < args.length)) {
      testLow = Integer.parseInt(args[i + 1]);
      testHigh = Integer.parseInt(args[i + 2]);
      i += 3;
    } else if (args[i].equalsIgnoreCase("-serialize") && (i + 1 < args.length)) {
      serializeFile = args[i + 1];
      i += 2;
    } else if (args[i].equalsIgnoreCase("-tLPP") && (i + 1 < args.length)) {
      try {
        op.tlpParams = (TreebankLangParserParams) Class.forName(args[i + 1]).newInstance();
      } catch (ClassNotFoundException e) {
        System.err.println("Class not found: " + args[i + 1]);
        throw new RuntimeException(e);
      } catch (InstantiationException e) {
        System.err.println("Couldn't instantiate: " + args[i + 1] + ": " + e.toString());
        throw new RuntimeException(e);
      } catch (IllegalAccessException e) {
        System.err.println("Illegal access: " + e);
        throw new RuntimeException(e);
      }
      i += 2;
    } else if (args[i].equals("-encoding")) {
      // sets encoding for TreebankLangParserParams
      op.tlpParams.setInputEncoding(args[i + 1]);
      op.tlpParams.setOutputEncoding(args[i + 1]);
      i += 2;
    } else {
      i = op.setOptionOrWarn(args, i);
    }
  }
  // System.out.println(tlpParams.getClass());

  TreebankLanguagePack tlp = op.tlpParams.treebankLanguagePack();

  op.trainOptions.sisterSplitters =
      new HashSet<String>(Arrays.asList(op.tlpParams.sisterSplitters()));
  // BinarizerFactory.TreeAnnotator.setTreebankLang(tlpParams);
  PrintWriter pw = op.tlpParams.pw();
  op.testOptions.display();
  op.trainOptions.display();
  op.display();
  op.tlpParams.display();

  // setup tree transforms
  Treebank trainTreebank = op.tlpParams.memoryTreebank();
  MemoryTreebank testTreebank = op.tlpParams.testMemoryTreebank();
  // Treebank blippTreebank = ((EnglishTreebankParserParams) tlpParams).diskTreebank();
  // String blippPath = "/afs/ir.stanford.edu/data/linguistic-data/BLLIP-WSJ/";
  // blippTreebank.loadPath(blippPath, "", true);

  Timing.startTime();
  System.err.print("Reading trees...");
  testTreebank.loadPath(path, new NumberRangeFileFilter(testLow, testHigh, true));
  if (op.testOptions.increasingLength) {
    Collections.sort(testTreebank, new TreeLengthComparator());
  }

  trainTreebank.loadPath(path, new NumberRangeFileFilter(trainLow, trainHigh, true));
  Timing.tick("done.");

  System.err.print("Binarizing trees...");
  TreeAnnotatorAndBinarizer binarizer;
  if (!op.trainOptions.leftToRight) {
    binarizer =
        new TreeAnnotatorAndBinarizer(
            op.tlpParams, op.forceCNF, !op.trainOptions.outsideFactor(), true, op);
  } else {
    binarizer =
        new TreeAnnotatorAndBinarizer(
            op.tlpParams.headFinder(),
            new LeftHeadFinder(),
            op.tlpParams,
            op.forceCNF,
            !op.trainOptions.outsideFactor(),
            true,
            op);
  }
  CollinsPuncTransformer collinsPuncTransformer = null;
  if (op.trainOptions.collinsPunc) {
    collinsPuncTransformer = new CollinsPuncTransformer(tlp);
  }
  TreeTransformer debinarizer = new Debinarizer(op.forceCNF);
  List<Tree> binaryTrainTrees = new ArrayList<Tree>();

  if (op.trainOptions.selectiveSplit) {
    op.trainOptions.splitters =
        ParentAnnotationStats.getSplitCategories(
            trainTreebank,
            op.trainOptions.tagSelectiveSplit,
            0,
            op.trainOptions.selectiveSplitCutOff,
            op.trainOptions.tagSelectiveSplitCutOff,
            op.tlpParams.treebankLanguagePack());
    if (op.trainOptions.deleteSplitters != null) {
      List<String> deleted = new ArrayList<String>();
      for (String del : op.trainOptions.deleteSplitters) {
        String baseDel = tlp.basicCategory(del);
        boolean checkBasic = del.equals(baseDel);
        for (Iterator<String> it = op.trainOptions.splitters.iterator(); it.hasNext(); ) {
          String elem = it.next();
          String baseElem = tlp.basicCategory(elem);
          boolean delStr = checkBasic && baseElem.equals(baseDel) || elem.equals(del);
          if (delStr) {
            it.remove();
            deleted.add(elem);
          }
        }
      }
      System.err.println("Removed from vertical splitters: " + deleted);
    }
  }
  if (op.trainOptions.selectivePostSplit) {
    TreeTransformer myTransformer =
        new TreeAnnotator(op.tlpParams.headFinder(), op.tlpParams, op);
    Treebank annotatedTB = trainTreebank.transform(myTransformer);
    op.trainOptions.postSplitters =
        ParentAnnotationStats.getSplitCategories(
            annotatedTB,
            true,
            0,
            op.trainOptions.selectivePostSplitCutOff,
            op.trainOptions.tagSelectivePostSplitCutOff,
            op.tlpParams.treebankLanguagePack());
  }

  if (op.trainOptions.hSelSplit) {
    binarizer.setDoSelectiveSplit(false);
    for (Tree tree : trainTreebank) {
      if (op.trainOptions.collinsPunc) {
        tree = collinsPuncTransformer.transformTree(tree);
      }
      // tree.pennPrint(tlpParams.pw());
      tree = binarizer.transformTree(tree);
      // binaryTrainTrees.add(tree);
    }
    binarizer.setDoSelectiveSplit(true);
  }
  for (Tree tree : trainTreebank) {
    if (op.trainOptions.collinsPunc) {
      tree = collinsPuncTransformer.transformTree(tree);
    }
    tree = binarizer.transformTree(tree);
    binaryTrainTrees.add(tree);
  }
  if (op.testOptions.verbose) {
    binarizer.dumpStats();
  }

  List<Tree> binaryTestTrees = new ArrayList<Tree>();
  for (Tree tree : testTreebank) {
    if (op.trainOptions.collinsPunc) {
      tree = collinsPuncTransformer.transformTree(tree);
    }
    tree = binarizer.transformTree(tree);
    binaryTestTrees.add(tree);
  }
  Timing.tick("done."); // binarization

  BinaryGrammar bg = null;
  UnaryGrammar ug = null;
  DependencyGrammar dg = null;
  // DependencyGrammar dgBLIPP = null;
  Lexicon lex = null;

  Index<String> stateIndex = new HashIndex<String>();

  // extract grammars
  Extractor<Pair<UnaryGrammar, BinaryGrammar>> bgExtractor =
      new BinaryGrammarExtractor(op, stateIndex);
  // Extractor bgExtractor = new SmoothedBinaryGrammarExtractor(); // new BinaryGrammarExtractor();
  // Extractor lexExtractor = new LexiconExtractor();
  // Extractor dgExtractor = new DependencyMemGrammarExtractor();

  if (op.doPCFG) {
    System.err.print("Extracting PCFG...");
    Pair<UnaryGrammar, BinaryGrammar> bgug = null;
    if (op.trainOptions.cheatPCFG) {
      List<Tree> allTrees = new ArrayList<Tree>(binaryTrainTrees);
      allTrees.addAll(binaryTestTrees);
      bgug = bgExtractor.extract(allTrees);
    } else {
      bgug = bgExtractor.extract(binaryTrainTrees);
    }
    bg = bgug.second;
    bg.splitRules();
    ug = bgug.first;
    ug.purgeRules();
    Timing.tick("done.");
  }
  System.err.print("Extracting Lexicon...");
  Index<String> wordIndex = new HashIndex<String>();
  Index<String> tagIndex = new HashIndex<String>();
  lex = op.tlpParams.lex(op, wordIndex, tagIndex);
  lex.train(binaryTrainTrees);
  Timing.tick("done.");

  if (op.doDep) {
    System.err.print("Extracting Dependencies...");
    binaryTrainTrees.clear();
    Extractor<DependencyGrammar> dgExtractor =
        new MLEDependencyGrammarExtractor(op, wordIndex, tagIndex);
    // dgBLIPP = (DependencyGrammar) dgExtractor.extract(
    //     new ConcatenationIterator(trainTreebank.iterator(), blippTreebank.iterator()),
    //     new TransformTreeDependency(tlpParams, true));
    // DependencyGrammar dg1 = dgExtractor.extract(
    //     trainTreebank.iterator(), new TransformTreeDependency(op.tlpParams, true));
    // dgBLIPP = (DependencyGrammar) dgExtractor.extract(
    //     blippTreebank.iterator(), new TransformTreeDependency(tlpParams));
    // dg = (DependencyGrammar) dgExtractor.extract(
    //     new ConcatenationIterator(trainTreebank.iterator(), blippTreebank.iterator()),
    //     new TransformTreeDependency(tlpParams));
    // dg = new DependencyGrammarCombination(dg1, dgBLIPP, 2);
    // Uses information about whether the words are known or not; discards unknown words.
    dg = dgExtractor.extract(binaryTrainTrees);
    Timing.tick("done.");
    // System.out.print("Extracting Unknown Word Model...");
    // UnknownWordModel uwm = (UnknownWordModel) uwmExtractor.extract(binaryTrainTrees);
    // Timing.tick("done.");
    System.out.print("Tuning Dependency Model...");
    dg.tune(binaryTestTrees);
    // System.out.println("TUNE DEPS: " + tuneDeps);
    Timing.tick("done.");
  }

  BinaryGrammar boundBG = bg;
  UnaryGrammar boundUG = ug;

  GrammarProjection gp = new NullGrammarProjection(bg, ug);

  // serialization
  if (serializeFile != null) {
    System.err.print("Serializing parser...");
    LexicalizedParser.saveParserDataToSerialized(
        new ParserData(lex, bg, ug, dg, stateIndex, wordIndex, tagIndex, op), serializeFile);
    Timing.tick("done.");
  }

  // test: pcfg-parse and output
  ExhaustivePCFGParser parser = null;
  if (op.doPCFG) {
    parser = new ExhaustivePCFGParser(boundBG, boundUG, lex, op, stateIndex, wordIndex, tagIndex);
  }

  ExhaustiveDependencyParser dparser =
      ((op.doDep && !op.testOptions.useFastFactored)
          ? new ExhaustiveDependencyParser(dg, lex, op, wordIndex, tagIndex)
          : null);

  Scorer scorer =
      (op.doPCFG ? new TwinScorer(new ProjectionScorer(parser, gp, op), dparser) : null);
  // Scorer scorer = parser;
  BiLexPCFGParser bparser = null;
  if (op.doPCFG && op.doDep) {
    bparser =
        (op.testOptions.useN5)
            ? new BiLexPCFGParser.N5BiLexPCFGParser(
                scorer, parser, dparser, bg, ug, dg, lex, op, gp, stateIndex, wordIndex, tagIndex)
            : new BiLexPCFGParser(
                scorer, parser, dparser, bg, ug, dg, lex, op, gp, stateIndex, wordIndex,
                tagIndex);
  }

  Evalb pcfgPE = new Evalb("pcfg PE", true);
  Evalb comboPE = new Evalb("combo PE", true);
  AbstractEval pcfgCB = new Evalb.CBEval("pcfg CB", true);

  AbstractEval pcfgTE = new TaggingEval("pcfg TE");
  AbstractEval comboTE = new TaggingEval("combo TE");
  AbstractEval pcfgTEnoPunct = new TaggingEval("pcfg nopunct TE");
  AbstractEval comboTEnoPunct = new TaggingEval("combo nopunct TE");
  AbstractEval depTE = new TaggingEval("depnd TE");

  AbstractEval depDE =
      new UnlabeledAttachmentEval("depnd DE", true, null, tlp.punctuationWordRejectFilter());
  AbstractEval comboDE =
      new UnlabeledAttachmentEval("combo DE", true, null, tlp.punctuationWordRejectFilter());

  if (op.testOptions.evalb) {
    EvalbFormatWriter.initEVALBfiles(op.tlpParams);
  }

  // int[] countByLength = new int[op.testOptions.maxLength + 1];

  // Use a reflection ruse, so one can run this without needing the
  // tagger. Using a function rather than a MaxentTagger means we
  // can distribute a version of the parser that doesn't include the
  // entire tagger.
  Function<List<? extends HasWord>, ArrayList<TaggedWord>> tagger = null;
  if (op.testOptions.preTag) {
    try {
      Class[] argsClass = {String.class};
      Object[] arguments = new Object[] {op.testOptions.taggerSerializedFile};
      tagger =
          (Function<List<? extends HasWord>, ArrayList<TaggedWord>>)
              Class.forName("edu.stanford.nlp.tagger.maxent.MaxentTagger")
                  .getConstructor(argsClass)
                  .newInstance(arguments);
    } catch (Exception e) {
      System.err.println(e);
      System.err.println("Warning: No pretagging of sentences will be done.");
    }
  }

  for (int tNum = 0, ttSize = testTreebank.size(); tNum < ttSize; tNum++) {
    Tree tree = testTreebank.get(tNum);
    int testTreeLen = tree.yield().size();
    if (testTreeLen > op.testOptions.maxLength) {
      continue;
    }
    Tree binaryTree = binaryTestTrees.get(tNum);
    // countByLength[testTreeLen]++;
    System.out.println("-------------------------------------");
    System.out.println("Number: " + (tNum + 1));
    System.out.println("Length: " + testTreeLen);

    // tree.pennPrint(pw);
    // System.out.println("XXXX The binary tree is");
    // binaryTree.pennPrint(pw);

    // System.out.println("Here are the tags in the lexicon:");
    // System.out.println(lex.showTags());
    // System.out.println("Here's the tagnumberer:");
    // System.out.println(Numberer.getGlobalNumberer("tags").toString());

    long timeMil1 = System.currentTimeMillis();
    Timing.tick("Starting parse.");
    if (op.doPCFG) {
      // System.err.println(op.testOptions.forceTags);
      if (op.testOptions.forceTags) {
        if (tagger != null) {
          // System.out.println("Using a tagger to set tags");
          // System.out.println("Tagged sentence as: "
          //     + tagger.processSentence(cutLast(wordify(binaryTree.yield()))).toString(false));
          parser.parse(addLast(tagger.apply(cutLast(wordify(binaryTree.yield())))));
        } else {
          // System.out.println("Forcing tags to match input.");
          parser.parse(cleanTags(binaryTree.taggedYield(), tlp));
        }
      } else {
        // System.out.println("XXXX Parsing " + binaryTree.yield());
        parser.parse(binaryTree.yieldHasWord());
      }
      // Timing.tick("Done with pcfg phase.");
    }
    if (op.doDep) {
      dparser.parse(binaryTree.yieldHasWord());
      // Timing.tick("Done with dependency phase.");
    }
    boolean bothPassed = false;
    if (op.doPCFG && op.doDep) {
      bothPassed = bparser.parse(binaryTree.yieldHasWord());
      // Timing.tick("Done with combination phase.");
    }
    long timeMil2 = System.currentTimeMillis();
    long elapsed = timeMil2 - timeMil1;
    System.err.println("Time: " + ((int) (elapsed / 100)) / 10.00 + " sec.");
    // System.out.println("PCFG Best Parse:");
    Tree tree2b = null;
    Tree tree2 = null;
    // System.out.println("Got full best parse...");
    if (op.doPCFG) {
      tree2b = parser.getBestParse();
      tree2 = debinarizer.transformTree(tree2b);
    }
    // System.out.println("Debinarized parse...");
    // tree2.pennPrint();
    // System.out.println("DepG Best Parse:");
    Tree tree3 = null;
    Tree tree3db = null;
    if (op.doDep) {
      tree3 = dparser.getBestParse();
      // was: but wrong Tree tree3db = debinarizer.transformTree(tree2);
      tree3db = debinarizer.transformTree(tree3);
      tree3.pennPrint(pw);
    }
    // tree.pennPrint();
    // ((Tree) binaryTrainTrees.get(tNum)).pennPrint();
    // System.out.println("Combo Best Parse:");
    Tree tree4 = null;
    if (op.doPCFG && op.doDep) {
      try {
        tree4 = bparser.getBestParse();
        if (tree4 == null) {
          tree4 = tree2b;
        }
      } catch (NullPointerException e) {
        System.err.println("Blocked, using PCFG parse!");
        tree4 = tree2b;
      }
    }
    if (op.doPCFG && !bothPassed) {
      tree4 = tree2b;
    }
    // tree4.pennPrint();
    if (op.doDep) {
      depDE.evaluate(tree3, binaryTree, pw);
      depTE.evaluate(tree3db, tree, pw);
    }
    TreeTransformer tc = op.tlpParams.collinizer();
    TreeTransformer tcEvalb = op.tlpParams.collinizerEvalb();
    if (op.doPCFG) {
      // System.out.println("XXXX Best PCFG was: ");
      // tree2.pennPrint();
      // System.out.println("XXXX Transformed best PCFG is: ");
      // tc.transformTree(tree2).pennPrint();
      // System.out.println("True Best Parse:");
      // tree.pennPrint();
      // tc.transformTree(tree).pennPrint();
      pcfgPE.evaluate(tc.transformTree(tree2), tc.transformTree(tree), pw);
      pcfgCB.evaluate(tc.transformTree(tree2), tc.transformTree(tree), pw);
      Tree tree4b = null;
      if (op.doDep) {
        comboDE.evaluate((bothPassed ? tree4 : tree3), binaryTree, pw);
        tree4b = tree4;
        tree4 = debinarizer.transformTree(tree4);
        if (op.nodePrune) {
          NodePruner np = new NodePruner(parser, debinarizer);
          tree4 = np.prune(tree4);
        }
        // tree4.pennPrint();
        comboPE.evaluate(tc.transformTree(tree4), tc.transformTree(tree), pw);
      }
      // pcfgTE.evaluate(tree2, tree);
      pcfgTE.evaluate(tcEvalb.transformTree(tree2), tcEvalb.transformTree(tree), pw);
      pcfgTEnoPunct.evaluate(tc.transformTree(tree2), tc.transformTree(tree), pw);
      if (op.doDep) {
        comboTE.evaluate(tcEvalb.transformTree(tree4), tcEvalb.transformTree(tree), pw);
        comboTEnoPunct.evaluate(tc.transformTree(tree4), tc.transformTree(tree), pw);
      }
      System.out.println("PCFG only: " + parser.scoreBinarizedTree(tree2b, 0));
      // tc.transformTree(tree2).pennPrint();
      tree2.pennPrint(pw);
      if (op.doDep) {
        System.out.println("Combo: " + parser.scoreBinarizedTree(tree4b, 0));
        // tc.transformTree(tree4).pennPrint(pw);
        tree4.pennPrint(pw);
      }
      System.out.println("Correct:" + parser.scoreBinarizedTree(binaryTree, 0));
      /*
      if (parser.scoreBinarizedTree(tree2b, true) < parser.scoreBinarizedTree(binaryTree, true)) {
        System.out.println("SCORE INVERSION");
        parser.validateBinarizedTree(binaryTree, 0);
      }
      */
      tree.pennPrint(pw);
    } // end if doPCFG
    if (op.testOptions.evalb) {
      if (op.doPCFG && op.doDep) {
        EvalbFormatWriter.writeEVALBline(
            tcEvalb.transformTree(tree), tcEvalb.transformTree(tree4));
      } else if (op.doPCFG) {
        EvalbFormatWriter.writeEVALBline(
            tcEvalb.transformTree(tree), tcEvalb.transformTree(tree2));
      } else if (op.doDep) {
        EvalbFormatWriter.writeEVALBline(
            tcEvalb.transformTree(tree), tcEvalb.transformTree(tree3db));
      }
    }
  } // end for each tree in test treebank

  if (op.testOptions.evalb) {
    EvalbFormatWriter.closeEVALBfiles();
  }
  // op.testOptions.display();
  if (op.doPCFG) {
    pcfgPE.display(false, pw);
    System.out.println("Grammar size: " + stateIndex.size());
    pcfgCB.display(false, pw);
    if (op.doDep) {
      comboPE.display(false, pw);
    }
    pcfgTE.display(false, pw);
    pcfgTEnoPunct.display(false, pw);
    if (op.doDep) {
      comboTE.display(false, pw);
      comboTEnoPunct.display(false, pw);
    }
  }
  if (op.doDep) {
    depTE.display(false, pw);
    depDE.display(false, pw);
  }
  if (op.doPCFG && op.doDep) {
    comboDE.display(false, pw);
  }
  // pcfgPE.printGoodBad();
}
private boolean recordTypedefs(TypeDeclaration declaration) {
  SourceTypeBinding binding = declaration.binding;
  if (binding == null) {
    return false;
  }
  Annotation[] annotations = declaration.annotations;
  if (annotations != null) {
    if (declaration.binding.isAnnotationType()) {
      for (Annotation annotation : annotations) {
        String typeName = Extractor.getFqn(annotation);
        if (typeName == null) {
          continue;
        }

        if (Extractor.isNestedAnnotation(typeName)) {
          String fqn = new String(binding.readableName());
          List<Annotation> list = mMap.get(fqn);
          if (list == null) {
            list = new ArrayList<Annotation>(2);
            mMap.put(fqn, list);
          }
          list.add(annotation);

          if (mRequireHide) {
            Javadoc javadoc = declaration.javadoc;
            if (javadoc != null) {
              StringBuffer stringBuffer = new StringBuffer(200);
              javadoc.print(0, stringBuffer);
              String documentation = stringBuffer.toString();
              if (!documentation.contains("@hide")) {
                Extractor.warning(
                    getFileName()
                        + ": The typedef annotation "
                        + fqn
                        + " should specify @hide in a doc comment");
              }
            }
          }

          if (mRequireSourceRetention && !Extractor.hasSourceRetention(annotations)) {
            Extractor.warning(
                getFileName()
                    + ": The typedef annotation "
                    + fqn
                    + " should have @Retention(RetentionPolicy.SOURCE)");
          }

          if (declaration.binding != null
              && (declaration.modifiers & ClassFileConstants.AccPublic) == 0) {
            StringBuilder sb = new StringBuilder(100);
            for (char c : declaration.binding.qualifiedPackageName()) {
              if (c == '.') {
                sb.append('/');
              } else {
                sb.append(c);
              }
            }
            sb.append(File.separatorChar);
            for (char c : declaration.binding.qualifiedSourceName()) {
              if (c == '.') {
                sb.append('$');
              } else {
                sb.append(c);
              }
            }
            mTypedefClasses.add(sb.toString());
          }
        }
      }
    }
  }
  return true;
}