private void assertOffsetConversionOk(String testData, String patStr) {
    // Find every occurrence of patStr in testData, recording each match's
    // UTF-16 char offset, its code point offset, and a dummy HASHTAG entity
    // positioned at the match.
    final Matcher matcher = Pattern.compile(patStr).matcher(testData);

    final List<Extractor.Entity> entities = new ArrayList<Extractor.Entity>();
    final List<Integer> codePointOffsets = new ArrayList<Integer>();
    final List<Integer> charOffsets = new ArrayList<Integer>();
    while (matcher.find()) {
      final int matchStart = matcher.start();
      charOffsets.add(matchStart);
      codePointOffsets.add(testData.codePointCount(0, matchStart));
      entities.add(new Extractor.Entity(matcher, Extractor.Entity.Type.HASHTAG, 0, 0));
    }

    // UTF-16 -> code point conversion must yield the offsets computed above.
    extractor.modifyIndicesFromUTF16ToToUnicode(testData, entities);
    for (int i = 0; i < entities.size(); i++) {
      assertEquals(codePointOffsets.get(i), entities.get(i).getStart());
    }

    // Converting back must restore the original UTF-16 offsets. This could
    // fail if an entity started in the middle of a surrogate pair (no
    // equivalent code point offset exists there), but such input would be
    // pathological, so we simply let the test fail in that case.
    extractor.modifyIndicesFromUnicodeToUTF16(testData, entities);
    for (int i = 0; i < entities.size(); i++) {
      assertEquals(charOffsets.get(i), entities.get(i).getStart());
    }
  }
  @Test(groups = "functest")
  public void shouldGetQuotesMessages() {
    final String path = "./target/test-classes/";

    // Build an Instapaper extractor against the test-classes configuration
    // and pull messages from the "Citas" folder.
    final Properties props = PropertyHelper.loadProperties(path);
    final Extractor generator = new InstapaperExtractor(props, "Citas", false, "10");
    final Set<Entry> messages = generator.extract();

    assertNotNull(messages, "no messages are retrieved");
    assertTrue(messages.size() > 0, "expected more messages");

    // Dump every retrieved entry so the test run can be inspected manually.
    messages.forEach(
        entry -> System.out.println("url:" + entry.getUrl() + ".text:" + entry.getText()));
  }
  /** URLs with special ccTLDs (.tv, .co) must be extracted even without a protocol. */
  public void testUrlWithSpecialCCTLDWithoutProtocol() {
    final String text = "MLB.tv vine.co";
    assertList(
        "Failed to extract URLs without protocol",
        new String[] {"MLB.tv", "vine.co"},
        extractor.extractURLs(text));

    // Check the reported start/end indices of both extracted URLs.
    final List<Extractor.Entity> extracted = extractor.extractURLsWithIndices(text);
    final Extractor.Entity first = extracted.get(0);
    final Extractor.Entity second = extracted.get(1);
    assertEquals(first.getStart().intValue(), 0);
    assertEquals(first.getEnd().intValue(), 6);
    assertEquals(second.getStart().intValue(), 7);
    assertEquals(second.getEnd().intValue(), 14);

    // With protocol-less extraction disabled, nothing should be found.
    extractor.setExtractURLWithoutProtocol(false);
    assertTrue("Should not extract URLs w/o protocol", extractor.extractURLs(text).isEmpty());
  }
 /**
  * Associates this object with the given project and resets the DLL
  * directory cache. A {@code null} project marks this as the GAC instance
  * (presumably "Global Assembly Cache" — confirm against the field's usage).
  *
  * @param project the project to bind, or {@code null} for the GAC case
  */
 public void setProject(IProject project) {
   super.setProject(project);
   if (project == null) {
     isGAC = true;
   }
   // Diamond instead of the raw HashSet so the element type is inferred
   // from the field's declaration rather than erased to raw.
   dllDirs = new HashSet<>();
 }
Exemple #5
0
  /**
   * Auto-link hashtags, URLs, usernames and lists.
   *
   * @param text of the Tweet to auto-link
   * @return text with auto-link HTML added
   */
  public String autoLink(String text) {
    // Escape brackets first; entities are then extracted from the escaped
    // text so their indices match the string passed to autoLinkEntities.
    final String escaped = escapeBrackets(text);
    final List<Entity> entities = extractor.extractEntitiesWithIndices(escaped);
    return autoLinkEntities(escaped, entities);
  }
Exemple #6
0
  /**
   * Auto-link every supported entity type in {@code text}, additionally
   * taking the supplied Twitter media and URL entities into account.
   *
   * @param text of the Tweet to auto-link
   * @param twitterMediaEntity media entity associated with the Tweet
   * @param mediaEntities media entities passed through to the linker
   * @param urlEntities URL entities passed through to the linker
   * @return text with auto-link HTML added
   */
  public String autoLinkAll(
      String text,
      TwitterMediaEntity twitterMediaEntity,
      MediaEntity[] mediaEntities,
      URLEntity[] urlEntities) {
    // Escape brackets first; entities are then extracted from the escaped
    // text so their indices line up.
    final String escaped = escapeBrackets(text);
    final List<Entity> entities = extractor.extractEntitiesWithIndices(escaped);
    return autoLinkEntities(escaped, twitterMediaEntity, entities, mediaEntities, urlEntities);
  }
Exemple #7
0
  /**
   * Reads the JSON configuration, opens the shared database connection, and
   * prepares every configured extractor. A failure while preparing one
   * extractor is reported to stderr but does not stop the remaining ones.
   *
   * @param configFilename path of the JSON configuration file
   * @throws SQLException if the database connection cannot be established
   * @throws IOException if the configuration file cannot be read
   * @throws JSONException if the configuration file cannot be parsed
   */
  public static void connect(String configFilename)
      throws SQLException, IOException, JSONException {
    // Load settings before anything touches the database.
    JSONConfig.read(configFilename);

    // One shared connection is used by all extractor instances.
    connection = Extractor.connect();

    for (Class<? extends Extractor> cls : JSONConfig.getExtractorClasses()) {
      final Extractor ex = Extractor.getExtractor(cls);
      try {
        ex.prepare();
      } catch (SQLException e) {
        // Best-effort: report and continue preparing the other extractors.
        System.err.println(
            "Cannot extract the information of the ontologies with the instance of "
                + ex.getClass());
        e.printStackTrace();
      }
    }
  }
Exemple #8
0
  /**
   * Creates an auto-linker configured with the default CSS classes and URL
   * bases for each entity type. Extraction of URLs that lack a protocol is
   * disabled.
   */
  public AutolinkEx() {
    // Plain URLs get no CSS class of their own.
    urlClass = null;

    // Each entity type's CSS class paired with its link URL base.
    usernameClass = DEFAULT_USERNAME_CLASS;
    usernameUrlBase = DEFAULT_USERNAME_URL_BASE;
    listClass = DEFAULT_LIST_CLASS;
    listUrlBase = DEFAULT_LIST_URL_BASE;
    hashtagClass = DEFAULT_HASHTAG_CLASS;
    hashtagUrlBase = DEFAULT_HASHTAG_URL_BASE;
    cashtagClass = DEFAULT_CASHTAG_CLASS;
    cashtagUrlBase = DEFAULT_CASHTAG_URL_BASE;

    invisibleTagAttrs = DEFAULT_INVISIBLE_TAG_ATTRS;

    extractor.setExtractURLWithoutProtocol(false);
  }
Exemple #9
0
  /**
   * Builds the command line for the SCM metrics extractor.
   *
   * @param url repository URL; for "git" it is resolved to the local
   *     download path, otherwise it is passed through as the repo argument
   * @param type repository type ("git" triggers local-path resolution)
   * @return the argument vector to execute
   */
  @Override
  public String[] getCommandExtractor(String url, String type) {
    // Database connection settings come from the user preferences.
    String username = PreferencesManager.INSTANCE.getDatabaseUsername();
    String password = PreferencesManager.INSTANCE.getDatabasePassword();
    String host = PreferencesManager.INSTANCE.getDatabaseHost();
    String database = PreferencesManager.INSTANCE.getDatabaseNameSCM();
    String repo;

    if (type.equals("git")) {
      try {
        repo = SourcesManager.INSTANCE.getDownloadSourcePath(url);
      } catch (SourcesManagerError e) {
        // NOTE(review): repo stays null and is placed into the command array
        // below — confirm the downstream consumer tolerates a null argument.
        repo = null;
      }
    } else {
      repo = url;
    }

    String[] cmd = {
      SCM_EXTRACTOR,
      "-u",
      username,
      "-p",
      password,
      "-d",
      database,
      "-H",
      host,
      "--extensions",
      "Metrics",
      "--metrics-all",
      repo
    };
    // Fix: cmd.toString() on an array logged only its identity hash
    // ("[Ljava.lang.String;@..."); join the elements so the actual command
    // appears in the debug log.
    super.logger.debug("Command: " + String.join(" ", cmd));

    return cmd;
  }
Exemple #10
0
  /**
   * Builds training instances from the CoNLL-2009 corpus in {@code file}.
   *
   * <p>Pass 1: register every relation label, word form, lemma, POS tag and
   * morphological feature with the feature mapper {@code mf}. Pass 2:
   * re-read the corpus, record observed head/dependent POS edges, and fill
   * the instance container {@code is}.
   *
   * @param file path to the CoNLL-2009 formatted training corpus
   * @param is instance container to be initialized and filled
   * @throws Exception if the corpus cannot be read or feature setup fails
   */
  public void createInstances(String file, Instances is) throws Exception {

    CONLLReader09 depReader = new CONLLReader09(file);

    // Make sure the artificial root relation is always registered.
    mf.register(REL, "<root-type>");

    // register at least one predicate since the parsing data might not contain predicates as in
    // the Japanese corpus but the development sets contains some

    long sl = 0; // accumulated token (label) count; not read again in this method

    System.out.print("Registering feature parts of sentence: ");
    int ic = 0;  // number of sentences read
    int del = 0; // width of the last progress printout (used by outValue to erase it)
    while (true) {
      SentenceData09 instance = depReader.getNext();
      if (instance == null) break;
      ic++;

      sl += instance.labels.length;

      // Progress output every 1000 sentences.
      if (ic % 1000 == 0) {
        del = outValue(ic, del);
      }

      // Register dependency relation labels.
      String[] labs1 = instance.labels;
      for (int i1 = 0; i1 < labs1.length; i1++) mf.register(REL, labs1[i1]);

      // Register normalized word forms and predicted lemmas as WORD features.
      String[] w = instance.forms;
      for (int i1 = 0; i1 < w.length; i1++) mf.register(WORD, depReader.normalize(w[i1]));

      w = instance.plemmas;
      for (int i1 = 0; i1 < w.length; i1++) mf.register(WORD, depReader.normalize(w[i1]));

      // Register predicted and gold POS tags.
      w = instance.ppos;
      for (int i1 = 0; i1 < w.length; i1++) mf.register(POS, w[i1]);

      w = instance.gpos;
      for (int i1 = 0; i1 < w.length; i1++) mf.register(POS, w[i1]);

      // Register morphological features, when present.
      if (instance.feats != null) {
        String fs[][] = instance.feats;
        for (int i1 = 0; i1 < fs.length; i1++) {
          w = fs[i1];
          if (w == null) continue;
          for (int i2 = 0; i2 < w.length; i2++) mf.register(FEAT, w[i2]);
        }
      }

      // Honor the configured sentence limit.
      if ((ic - 1) > options.count) break;
    }
    del = outValue(ic, del);

    System.out.println();
    Extractor.initFeatures();

    Extractor.maxForm = mf.getFeatureCounter().get(WORD);

    // Optional word-cluster file; otherwise fall back to an empty mapping.
    if (options.clusterFile == null) cl = new Cluster();
    else cl = new Cluster(options.clusterFile, mf, 6);

    mf.calculateBits();
    Extractor.initStat(options.featureCreation);

    System.out.println("" + mf.toString());

    for (Extractor e : extractor) e.init();

    // Pass 2: read the corpus again to build edge filters and fill `is`.
    depReader.startReading(file);

    int num1 = 0;

    is.init(ic, new MFO());

    Edges.init(mf.getFeatureCounter().get(POS));

    System.out.print("Creating edge filters and read corpus: ");
    del = 0;

    while (true) {
      if (num1 % 100 == 0) del = outValue(num1, del);

      SentenceData09 instance1 = depReader.getNext(is);

      if (instance1 == null) break;

      int last = is.size() - 1;
      short[] pos = is.pposs[last];

      // Record every observed (head POS, dependent POS, label) combination;
      // tokens with a negative head (no head) are skipped.
      for (int k = 0; k < is.length(last); k++) {
        if (is.heads[last][k] < 0) continue;
        Edges.put(pos[is.heads[last][k]], pos[k], is.labels[last][k]);
        //				Edges.put(pos[k],pos[is.heads[last][k]], is.labels[last][k]);
      }

      if (!options.allFeatures && num1 > options.count) break;

      num1++;
    }
    del = outValue(num1, del);
    System.out.println();
    Edges.findDefault();
  }
  /**
   * Worker task: repeatedly takes a (w1, w2) word-pair work item from the
   * shared queue and fills the score tables of {@code d} for sentence
   * {@code i}: basic (unlabeled) edge scores in both directions, labeled
   * edge scores, and — for every possible third word m — grandchild and
   * sibling label scores.
   *
   * @return always {@code null}; any exception is caught and printed
   */
  @Override
  public Object call() {

    try {

      F2SF f = para; // feature-score accumulator reused throughout

      short[] pos = is.pposs[i];
      int length = pos.length;

      // Scratch buffers for cluster (gvs) and second-order (svs) feature vectors.
      long[] gvs = new long[50];
      long[] svs = new long[220];

      // Keep pulling work items until the queue is drained.
      while (true) {

        DSet set = get();
        if (set == null) {
          break;
        }

        int w1 = set.w1;
        int w2 = set.w2;

        // Unlabeled (basic) scores for both edge directions.
        f.clear();
        extractor.basic(pos, w1, w2, f);
        d.pl[w1][w2] = f.getScoreF();

        f.clear();

        extractor.basic(pos, w2, w1, f);
        d.pl[w2][w1] = f.getScoreF();

        // Labeled scores, direction w1 -> w2.
        short[] labels = Edges.get(pos[w1], pos[w2]);
        float[] lab = d.lab[w1][w2];

        final Long2IntInterface li = extractor.li;

        int c = extractor.firstm(is, i, w1, w2, 0, cluster, svs);

        // Default every label score to a large negative value.
        for (int l = 0; l < lab.length; l++) {
          lab[l] = -100;
        }

        for (int l = 0; l < labels.length; l++) {
          short label = labels[l];

          f.clear();
          int lv = extractor.d0.computeLabeValue(label, Extractor.s_type);
          for (int k = 0; k < c; k++) {
            if (svs[k] > 0) {
              f.add(li.l2i(svs[k] + lv));
            }
          }

          lab[label] = f.getScoreF();
        }

        // Labeled scores, direction w2 -> w1 (label value shifted by s_rel1).
        labels = Edges.get(pos[w2], pos[w1]);
        lab = d.lab[w2][w1];

        for (int l = 0; l < lab.length; l++) {
          lab[l] = -100;
        }

        for (int l = 0; l < labels.length; l++) {
          int label = labels[l];

          f.clear();
          int lv = extractor.d0.computeLabeValue(label + Extractor.s_rel1, Extractor.s_type);
          for (int k = 0; k < c; k++) {
            if (svs[k] > 0) {
              f.add(li.l2i(svs[k] + lv));
            }
          }

          lab[label] = f.getScoreF();
        }

        // Span boundaries of the pair.
        int s = w1 < w2 ? w1 : w2;
        int e = w1 < w2 ? w2 : w1;

        // Second-order factors: consider every word m as a third word.
        for (int m = 0; m < length; m++) {

          // m equal to either endpoint cannot act as a third word; encode as -1.
          int g = (m == s || e == m) ? -1 : m;

          int cn = extractor.second(is, i, w1, w2, g, 0, cluster, svs);
          int cc = extractor.addClusterFeatures(is, i, w1, w2, g, cluster, 0, gvs, 0);
          // for(int k=0;k<c;k++) dl1.map(f,svs[k]);

          // Grandchild-style scores, direction w1 -> w2.
          if (m >= w1) {
            labels = Edges.get(pos[w1], pos[w2]);
            float[] lab2 = new float[labels.length];
            for (int l = 0; l < labels.length; l++) {

              short label = labels[l];

              // Label offset depends on which side of w2 the third word lies.
              int lx = label + Extractor.s_rel1 * (g < w2 ? 0 : 2);

              f.clear();
              int lv = extractor.d0.computeLabeValue(lx, Extractor.s_type);
              for (int k = 0; k < cn; k++) {
                if (svs[k] > 0) {
                  f.add(li.l2i(svs[k] + lv));
                }
              }
              for (int k = 0; k < cc; k++) {
                if (gvs[k] > 0) {
                  f.add(li.l2i(gvs[k] + lv));
                }
              }

              lab2[l] = f.getScoreF();
            }
            d.gra[w1][w2][m] = lab2;
          }

          // Grandchild-style scores, direction w2 -> w1.
          if (m <= w2) {
            labels = Edges.get(pos[w2], pos[w1]);
            float lab2[];
            d.gra[w2][w1][m] = lab2 = new float[labels.length];
            for (int l = 0; l < labels.length; l++) {

              int label = labels[l];
              int lx = label + Extractor.s_rel1 * (1 + (g < w1 ? 0 : 2));

              f.clear();
              int lv = extractor.d0.computeLabeValue(lx, Extractor.s_type);
              for (int k = 0; k < cn; k++) {
                if (svs[k] > 0) {
                  f.add(li.l2i(svs[k] + lv));
                }
              }
              for (int k = 0; k < cc; k++) {
                if (gvs[k] > 0) {
                  f.add(li.l2i(gvs[k] + lv));
                }
              }

              lab2[l] = f.getScoreF();
            }
          }

          // NOTE(review): g is recomputed with the identical expression —
          // redundant but harmless.
          g = (m == s || e == m) ? -1 : m;

          //	int cn = extractor.second(is,i,w1,w2,g,0, cluster, svs,Extractor._SIB);
          // Sibling scores for m inside the [w1, w2] span, direction w1 -> w2.
          if (m >= w1 && m <= w2) {
            labels = Edges.get(pos[w1], pos[w2]);
            float lab2[] = new float[labels.length];
            d.sib[w1][w2][m] = lab2;

            for (int l = 0; l < labels.length; l++) {

              short label = labels[l];

              int lx = label + Extractor.s_rel1 * (8);
              f.clear();
              int lv = extractor.d0.computeLabeValue(lx, Extractor.s_type);
              for (int k = 0; k < cn; k++) {
                if (svs[k] > 0) {
                  f.add(li.l2i(svs[k] + lv));
                }
              }
              for (int k = 0; k < cc; k++) {
                if (gvs[k] > 0) {
                  f.add(li.l2i(gvs[k] + lv));
                }
              }

              lab2[l] = (float) f.score; // f.getScoreF();
            }
          }
          // Sibling scores for m inside the [w1, w2] span, direction w2 -> w1.
          if (m >= w1 && m <= w2) {
            labels = Edges.get(pos[w2], pos[w1]);
            float[] lab2 = new float[labels.length];
            d.sib[w2][w1][m] = lab2;
            for (int l = 0; l < labels.length; l++) {

              int label = labels[l];

              int lx = label + Extractor.s_rel1 * (9);

              f.clear();
              int lv = extractor.d0.computeLabeValue(lx, Extractor.s_type);
              for (int k = 0; k < cn; k++) {
                if (svs[k] > 0) {
                  f.add(li.l2i(svs[k] + lv));
                }
              }
              for (int k = 0; k < cc; k++) {
                if (gvs[k] > 0) {
                  f.add(li.l2i(gvs[k] + lv));
                }
              }

              lab2[l] = f.score; // f.getScoreF();
            }
          }
        }
      }

    } catch (Exception e) {
      e.printStackTrace();
    }
    return null;
  }
 /**
  * Signals cancellation and asks the underlying extractor to abort the
  * extraction in progress.
  *
  * @return the result reported by the extractor's cancelExtraction()
  */
 public boolean cancelJob() {
   this.cancelSignalled = true;
   return this.extractor.cancelExtraction();
 }
Exemple #13
0
 /**
  * Auto-link $cashtag references in the provided Tweet text. The $cashtag links will have the
  * cashtagClass CSS class added.
  *
  * @param text of the Tweet to auto-link
  * @return text with auto-link HTML added
  */
 public String autoLinkCashtags(String text) {
   // Extract cashtag entities (with indices) and delegate the HTML rewriting.
   return autoLinkEntities(text, extractor.extractCashtagsWithIndices(text));
 }
Exemple #14
0
 /**
  * Auto-link URLs in the Tweet text provided.
  *
  * <p>This only auto-links URLs with protocol.
  *
  * @param text of the Tweet to auto-link
  * @return text with auto-link HTML added
  */
 public String autoLinkURLs(String text) {
   // Extract URL entities (with indices) and delegate the HTML rewriting.
   return autoLinkEntities(text, extractor.extractURLsWithIndices(text));
 }
Exemple #15
0
 /**
  * Auto-link the @username and @username/list references in the provided text. Links to @username
  * references will have the usernameClass CSS classes added. Links to @username/list references
  * will have the listClass CSS class added.
  *
  * @param text of the Tweet to auto-link
  * @return text with auto-link HTML added
  */
 public String autoLinkUsernamesAndLists(String text) {
   // Extract mention/list entities (with indices) and delegate the rewriting.
   return autoLinkEntities(text, extractor.extractMentionsOrListsWithIndices(text));
 }
  /**
   * Extracts all remaining entries from {@code entriesToExtractIterator} to
   * individual files under {@code outDir}.
   *
   * @param outDir output directory; created (with parents) if it does not exist
   * @param obs progress observer; null-checked for some callbacks but not all
   *     (see review note below)
   * @param allowCancel when true, a pending cancel signal ends the loop
   *     between entries
   * @param convertData when true each entry is written through its own
   *     FormatConverter; when false the raw data is written with a ".raw"
   *     filename suffix
   * @throws IOException if an output file cannot be created or written
   */
  public void extractToFiles(
      File outDir, ExtractionObserver obs, boolean allowCancel, boolean convertData)
      throws IOException {
    cancelSignalled = false;

    if (outDir.exists() && !outDir.isDirectory()) {
      throw new IllegalArgumentException(outDir.getAbsolutePath() + " is not a directory");
    }

    LodEntry currentEntry;

    if (!outDir.exists()) {
      outDir.mkdirs();
      // NOTE(review): obs is used unguarded here but null-checked further
      // down — confirm callers never pass null when outDir must be created.
      obs.directoryCreated(outDir);
    }

    while (entriesToExtractIterator.hasNext() && !(allowCancel && cancelSignalled)) {
      currentEntry = (LodEntry) entriesToExtractIterator.next();
      String filename = currentEntry.getFileName();

      // Choose a converter: the entry's own format converter, or a
      // pass-through that dumps raw bytes under "<name>.raw".
      FormatConverter converter = null;
      if (convertData) converter = currentEntry.getFormatConverter();
      else {
        converter = new NullFormatConverter();
        filename = filename + ".raw";
      }

      String identifier = currentEntry.getName();

      if (converter.requiresMultipleStreams()) {
        // IMPLEMENT: replace basefilename
        String baseFilename = filename;
        String[] filenames = converter.getSuggestedFilenames(baseFilename);
        OutputStream[] outputStreamArray = new OutputStream[filenames.length];
        for (int i = 0; i < outputStreamArray.length; ++i) {
          outputStreamArray[i] =
              new BufferedOutputStream(new FileOutputStream(new File(outDir, filenames[i])));
        }
        // NOTE(review): the streams are handed off to the converter here and
        // below — presumably the converter closes them; nothing in this
        // method does. Verify to rule out a descriptor leak.
        converter.setDestinationOutputStreamsForNewFormat(outputStreamArray, currentEntry);
      } else {
        converter.setDestinationOutputStreamForNewFormat(
            new BufferedOutputStream(new FileOutputStream(new File(outDir, filename))),
            currentEntry);
      }

      Extractor extractor = new Extractor();

      extractor.convert(
          identifier,
          currentEntry.getData(),
          converter.getSourceOutputStreamForOldFormat(),
          (null != obs) ? new EntryObserver(obs, total) : null,
          allowCancel);

      if (null != obs) {
        // NOTE(review): if counter and total are both integer types this is
        // integer division (0 until counter reaches total) — confirm a
        // fractional progress value was intended.
        obs.extractionProgress(identifier, counter++ / total);
      }
    }

    if (obs != null) {
      obs.extractionFinished("Done");
    }
  }
Exemple #17
0
 /** Closes the database connection held by {@code Extractor}. */
 public static void closeConnection() {
   Extractor.closeConnection();
 }
Exemple #18
0
  private ObjectDataExtractor(final String odeFolderPath, final String mapPath) throws Exception {
    Log.entry(odeFolderPath, mapPath);
    final MPQArchive map = new MPQArchive(mapPath);

    final File odeFolder = new File(odeFolderPath);
    final File jFile = new File(odeFolder, FILE_NAME_SCRIPT);
    final File wtsFile = new File(odeFolder, FILE_NAME_WTS);
    final File w3uFile = new File(odeFolder, FILE_NAME_W3U);
    final File w3tFile = new File(odeFolder, FILE_NAME_W3T);
    final File w3bFile = new File(odeFolder, FILE_NAME_W3B);

    map.extractFile(FILE_NAME_SCRIPT, jFile);
    map.extractFile(FILE_NAME_WTS, wtsFile);

    Log.info("Loading JASS into memory");
    String scriptContent = readFileToString(jFile.getPath(), StandardCharsets.UTF_8);
    Log.info("Loading WTS from file");
    final WTSFile wts = new WTSFile(wtsFile.getPath());

    final List<Extractor<?, ?>> extractors = new LinkedList<Extractor<?, ?>>();

    // -------------------------------------------------------
    // Unit Data Extractor
    extractors.add(
        new UnitDataExtractor(
            odeFolder, loadW3OFile(W3UFile.class, map, FILE_NAME_W3U, w3uFile, wts)));

    // -------------------------------------------------------
    // Item Data Extractor
    extractors.add(
        new ItemDataExtractor(
            odeFolder, loadW3OFile(W3TFile.class, map, FILE_NAME_W3T, w3tFile, wts)));

    // -------------------------------------------------------
    // Destructable Data Extractor
    extractors.add(
        new DestructableDataExtractor(
            odeFolder, loadW3OFile(W3BFile.class, map, FILE_NAME_W3B, w3bFile, wts)));

    for (final Extractor<?, ?> extractor : extractors) {
      scriptContent = extractor.processScript(scriptContent);
    }

    Log.info("Writing new script file to disk");
    final FileWriter fw = new FileWriter(jFile);
    fw.write(scriptContent);
    fw.close();

    Log.info("Replacing script file in map");
    final MPQFile file = new MPQFile(map, FILE_NAME_SCRIPT, MPQFileOpenScope.MPQ);
    file.removeFromArchive();
    file.close();

    final MPQCompressionFlags compr = new MPQCompressionFlags();
    compr.setCompression(Compression.BZIP2);

    map.addFile(jFile.getAbsolutePath(), FILE_NAME_SCRIPT, MPQFileFlags.fromInteger(0x200), compr);
    map.compactArchive((String) null);
    map.close();

    Log.exit();
  }
  public static void main(String[] args) {
    Options op = new Options(new EnglishTreebankParserParams());
    // op.tlpParams may be changed to something else later, so don't use it till
    // after options are parsed.

    System.out.println(StringUtils.toInvocationString("FactoredParser", args));

    String path = "/u/nlp/stuff/corpora/Treebank3/parsed/mrg/wsj";
    int trainLow = 200, trainHigh = 2199, testLow = 2200, testHigh = 2219;
    String serializeFile = null;

    int i = 0;
    while (i < args.length && args[i].startsWith("-")) {
      if (args[i].equalsIgnoreCase("-path") && (i + 1 < args.length)) {
        path = args[i + 1];
        i += 2;
      } else if (args[i].equalsIgnoreCase("-train") && (i + 2 < args.length)) {
        trainLow = Integer.parseInt(args[i + 1]);
        trainHigh = Integer.parseInt(args[i + 2]);
        i += 3;
      } else if (args[i].equalsIgnoreCase("-test") && (i + 2 < args.length)) {
        testLow = Integer.parseInt(args[i + 1]);
        testHigh = Integer.parseInt(args[i + 2]);
        i += 3;
      } else if (args[i].equalsIgnoreCase("-serialize") && (i + 1 < args.length)) {
        serializeFile = args[i + 1];
        i += 2;
      } else if (args[i].equalsIgnoreCase("-tLPP") && (i + 1 < args.length)) {
        try {
          op.tlpParams = (TreebankLangParserParams) Class.forName(args[i + 1]).newInstance();
        } catch (ClassNotFoundException e) {
          System.err.println("Class not found: " + args[i + 1]);
          throw new RuntimeException(e);
        } catch (InstantiationException e) {
          System.err.println("Couldn't instantiate: " + args[i + 1] + ": " + e.toString());
          throw new RuntimeException(e);
        } catch (IllegalAccessException e) {
          System.err.println("illegal access" + e);
          throw new RuntimeException(e);
        }
        i += 2;
      } else if (args[i].equals("-encoding")) {
        // sets encoding for TreebankLangParserParams
        op.tlpParams.setInputEncoding(args[i + 1]);
        op.tlpParams.setOutputEncoding(args[i + 1]);
        i += 2;
      } else {
        i = op.setOptionOrWarn(args, i);
      }
    }
    // System.out.println(tlpParams.getClass());
    TreebankLanguagePack tlp = op.tlpParams.treebankLanguagePack();

    op.trainOptions.sisterSplitters =
        new HashSet<String>(Arrays.asList(op.tlpParams.sisterSplitters()));
    //    BinarizerFactory.TreeAnnotator.setTreebankLang(tlpParams);
    PrintWriter pw = op.tlpParams.pw();

    op.testOptions.display();
    op.trainOptions.display();
    op.display();
    op.tlpParams.display();

    // setup tree transforms
    Treebank trainTreebank = op.tlpParams.memoryTreebank();
    MemoryTreebank testTreebank = op.tlpParams.testMemoryTreebank();
    // Treebank blippTreebank = ((EnglishTreebankParserParams) tlpParams).diskTreebank();
    // String blippPath = "/afs/ir.stanford.edu/data/linguistic-data/BLLIP-WSJ/";
    // blippTreebank.loadPath(blippPath, "", true);

    Timing.startTime();
    System.err.print("Reading trees...");
    testTreebank.loadPath(path, new NumberRangeFileFilter(testLow, testHigh, true));
    if (op.testOptions.increasingLength) {
      Collections.sort(testTreebank, new TreeLengthComparator());
    }

    trainTreebank.loadPath(path, new NumberRangeFileFilter(trainLow, trainHigh, true));
    Timing.tick("done.");

    System.err.print("Binarizing trees...");
    TreeAnnotatorAndBinarizer binarizer;
    if (!op.trainOptions.leftToRight) {
      binarizer =
          new TreeAnnotatorAndBinarizer(
              op.tlpParams, op.forceCNF, !op.trainOptions.outsideFactor(), true, op);
    } else {
      binarizer =
          new TreeAnnotatorAndBinarizer(
              op.tlpParams.headFinder(),
              new LeftHeadFinder(),
              op.tlpParams,
              op.forceCNF,
              !op.trainOptions.outsideFactor(),
              true,
              op);
    }

    CollinsPuncTransformer collinsPuncTransformer = null;
    if (op.trainOptions.collinsPunc) {
      collinsPuncTransformer = new CollinsPuncTransformer(tlp);
    }
    TreeTransformer debinarizer = new Debinarizer(op.forceCNF);
    List<Tree> binaryTrainTrees = new ArrayList<Tree>();

    if (op.trainOptions.selectiveSplit) {
      op.trainOptions.splitters =
          ParentAnnotationStats.getSplitCategories(
              trainTreebank,
              op.trainOptions.tagSelectiveSplit,
              0,
              op.trainOptions.selectiveSplitCutOff,
              op.trainOptions.tagSelectiveSplitCutOff,
              op.tlpParams.treebankLanguagePack());
      if (op.trainOptions.deleteSplitters != null) {
        List<String> deleted = new ArrayList<String>();
        for (String del : op.trainOptions.deleteSplitters) {
          String baseDel = tlp.basicCategory(del);
          boolean checkBasic = del.equals(baseDel);
          for (Iterator<String> it = op.trainOptions.splitters.iterator(); it.hasNext(); ) {
            String elem = it.next();
            String baseElem = tlp.basicCategory(elem);
            boolean delStr = checkBasic && baseElem.equals(baseDel) || elem.equals(del);
            if (delStr) {
              it.remove();
              deleted.add(elem);
            }
          }
        }
        System.err.println("Removed from vertical splitters: " + deleted);
      }
    }
    if (op.trainOptions.selectivePostSplit) {
      TreeTransformer myTransformer =
          new TreeAnnotator(op.tlpParams.headFinder(), op.tlpParams, op);
      Treebank annotatedTB = trainTreebank.transform(myTransformer);
      op.trainOptions.postSplitters =
          ParentAnnotationStats.getSplitCategories(
              annotatedTB,
              true,
              0,
              op.trainOptions.selectivePostSplitCutOff,
              op.trainOptions.tagSelectivePostSplitCutOff,
              op.tlpParams.treebankLanguagePack());
    }

    if (op.trainOptions.hSelSplit) {
      binarizer.setDoSelectiveSplit(false);
      for (Tree tree : trainTreebank) {
        if (op.trainOptions.collinsPunc) {
          tree = collinsPuncTransformer.transformTree(tree);
        }
        // tree.pennPrint(tlpParams.pw());
        tree = binarizer.transformTree(tree);
        // binaryTrainTrees.add(tree);
      }
      binarizer.setDoSelectiveSplit(true);
    }
    for (Tree tree : trainTreebank) {
      if (op.trainOptions.collinsPunc) {
        tree = collinsPuncTransformer.transformTree(tree);
      }
      tree = binarizer.transformTree(tree);
      binaryTrainTrees.add(tree);
    }
    if (op.testOptions.verbose) {
      binarizer.dumpStats();
    }

    List<Tree> binaryTestTrees = new ArrayList<Tree>();
    for (Tree tree : testTreebank) {
      if (op.trainOptions.collinsPunc) {
        tree = collinsPuncTransformer.transformTree(tree);
      }
      tree = binarizer.transformTree(tree);
      binaryTestTrees.add(tree);
    }
    Timing.tick("done."); // binarization
    BinaryGrammar bg = null;
    UnaryGrammar ug = null;
    DependencyGrammar dg = null;
    // DependencyGrammar dgBLIPP = null;
    Lexicon lex = null;
    Index<String> stateIndex = new HashIndex<String>();

    // extract grammars
    Extractor<Pair<UnaryGrammar, BinaryGrammar>> bgExtractor =
        new BinaryGrammarExtractor(op, stateIndex);
    // Extractor bgExtractor = new SmoothedBinaryGrammarExtractor();//new BinaryGrammarExtractor();
    // Extractor lexExtractor = new LexiconExtractor();

    // Extractor dgExtractor = new DependencyMemGrammarExtractor();

    if (op.doPCFG) {
      System.err.print("Extracting PCFG...");
      Pair<UnaryGrammar, BinaryGrammar> bgug = null;
      if (op.trainOptions.cheatPCFG) {
        List<Tree> allTrees = new ArrayList<Tree>(binaryTrainTrees);
        allTrees.addAll(binaryTestTrees);
        bgug = bgExtractor.extract(allTrees);
      } else {
        bgug = bgExtractor.extract(binaryTrainTrees);
      }
      bg = bgug.second;
      bg.splitRules();
      ug = bgug.first;
      ug.purgeRules();
      Timing.tick("done.");
    }
    System.err.print("Extracting Lexicon...");
    Index<String> wordIndex = new HashIndex<String>();
    Index<String> tagIndex = new HashIndex<String>();
    lex = op.tlpParams.lex(op, wordIndex, tagIndex);
    lex.train(binaryTrainTrees);
    Timing.tick("done.");

    if (op.doDep) {
      System.err.print("Extracting Dependencies...");
      binaryTrainTrees.clear();
      Extractor<DependencyGrammar> dgExtractor =
          new MLEDependencyGrammarExtractor(op, wordIndex, tagIndex);
      // dgBLIPP = (DependencyGrammar) dgExtractor.extract(new
      // ConcatenationIterator(trainTreebank.iterator(),blippTreebank.iterator()),new
      // TransformTreeDependency(tlpParams,true));

      // DependencyGrammar dg1 = dgExtractor.extract(trainTreebank.iterator(), new
      // TransformTreeDependency(op.tlpParams, true));
      // dgBLIPP=(DependencyGrammar)dgExtractor.extract(blippTreebank.iterator(),new
      // TransformTreeDependency(tlpParams));

      // dg = (DependencyGrammar) dgExtractor.extract(new
      // ConcatenationIterator(trainTreebank.iterator(),blippTreebank.iterator()),new
      // TransformTreeDependency(tlpParams));
      // dg=new DependencyGrammarCombination(dg1,dgBLIPP,2);
      dg =
          dgExtractor.extract(
              binaryTrainTrees); // uses information whether the words are known or not, discards
      // unknown words
      Timing.tick("done.");
      // System.out.print("Extracting Unknown Word Model...");
      // UnknownWordModel uwm = (UnknownWordModel)uwmExtractor.extract(binaryTrainTrees);
      // Timing.tick("done.");
      System.out.print("Tuning Dependency Model...");
      dg.tune(binaryTestTrees);
      // System.out.println("TUNE DEPS: "+tuneDeps);
      Timing.tick("done.");
    }

    BinaryGrammar boundBG = bg;
    UnaryGrammar boundUG = ug;

    GrammarProjection gp = new NullGrammarProjection(bg, ug);

    // serialization
    if (serializeFile != null) {
      System.err.print("Serializing parser...");
      LexicalizedParser.saveParserDataToSerialized(
          new ParserData(lex, bg, ug, dg, stateIndex, wordIndex, tagIndex, op), serializeFile);
      Timing.tick("done.");
    }

    // test: pcfg-parse and output

    ExhaustivePCFGParser parser = null;
    if (op.doPCFG) {
      parser = new ExhaustivePCFGParser(boundBG, boundUG, lex, op, stateIndex, wordIndex, tagIndex);
    }

    ExhaustiveDependencyParser dparser =
        ((op.doDep && !op.testOptions.useFastFactored)
            ? new ExhaustiveDependencyParser(dg, lex, op, wordIndex, tagIndex)
            : null);

    Scorer scorer =
        (op.doPCFG ? new TwinScorer(new ProjectionScorer(parser, gp, op), dparser) : null);
    // Scorer scorer = parser;
    BiLexPCFGParser bparser = null;
    if (op.doPCFG && op.doDep) {
      bparser =
          (op.testOptions.useN5)
              ? new BiLexPCFGParser.N5BiLexPCFGParser(
                  scorer, parser, dparser, bg, ug, dg, lex, op, gp, stateIndex, wordIndex, tagIndex)
              : new BiLexPCFGParser(
                  scorer,
                  parser,
                  dparser,
                  bg,
                  ug,
                  dg,
                  lex,
                  op,
                  gp,
                  stateIndex,
                  wordIndex,
                  tagIndex);
    }

    Evalb pcfgPE = new Evalb("pcfg  PE", true);
    Evalb comboPE = new Evalb("combo PE", true);
    AbstractEval pcfgCB = new Evalb.CBEval("pcfg  CB", true);

    AbstractEval pcfgTE = new TaggingEval("pcfg  TE");
    AbstractEval comboTE = new TaggingEval("combo TE");
    AbstractEval pcfgTEnoPunct = new TaggingEval("pcfg nopunct TE");
    AbstractEval comboTEnoPunct = new TaggingEval("combo nopunct TE");
    AbstractEval depTE = new TaggingEval("depnd TE");

    AbstractEval depDE =
        new UnlabeledAttachmentEval("depnd DE", true, null, tlp.punctuationWordRejectFilter());
    AbstractEval comboDE =
        new UnlabeledAttachmentEval("combo DE", true, null, tlp.punctuationWordRejectFilter());

    if (op.testOptions.evalb) {
      EvalbFormatWriter.initEVALBfiles(op.tlpParams);
    }

    // int[] countByLength = new int[op.testOptions.maxLength+1];

    // Use a reflection ruse, so one can run this without needing the
    // tagger.  Using a function rather than a MaxentTagger means we
    // can distribute a version of the parser that doesn't include the
    // entire tagger.
    Function<List<? extends HasWord>, ArrayList<TaggedWord>> tagger = null;
    if (op.testOptions.preTag) {
      try {
        Class[] argsClass = {String.class};
        Object[] arguments = new Object[] {op.testOptions.taggerSerializedFile};
        tagger =
            (Function<List<? extends HasWord>, ArrayList<TaggedWord>>)
                Class.forName("edu.stanford.nlp.tagger.maxent.MaxentTagger")
                    .getConstructor(argsClass)
                    .newInstance(arguments);
      } catch (Exception e) {
        System.err.println(e);
        System.err.println("Warning: No pretagging of sentences will be done.");
      }
    }

    for (int tNum = 0, ttSize = testTreebank.size(); tNum < ttSize; tNum++) {
      Tree tree = testTreebank.get(tNum);
      int testTreeLen = tree.yield().size();
      if (testTreeLen > op.testOptions.maxLength) {
        continue;
      }
      Tree binaryTree = binaryTestTrees.get(tNum);
      // countByLength[testTreeLen]++;
      System.out.println("-------------------------------------");
      System.out.println("Number: " + (tNum + 1));
      System.out.println("Length: " + testTreeLen);

      // tree.pennPrint(pw);
      // System.out.println("XXXX The binary tree is");
      // binaryTree.pennPrint(pw);
      // System.out.println("Here are the tags in the lexicon:");
      // System.out.println(lex.showTags());
      // System.out.println("Here's the tagnumberer:");
      // System.out.println(Numberer.getGlobalNumberer("tags").toString());

      long timeMil1 = System.currentTimeMillis();
      Timing.tick("Starting parse.");
      if (op.doPCFG) {
        // System.err.println(op.testOptions.forceTags);
        if (op.testOptions.forceTags) {
          if (tagger != null) {
            // System.out.println("Using a tagger to set tags");
            // System.out.println("Tagged sentence as: " +
            // tagger.processSentence(cutLast(wordify(binaryTree.yield()))).toString(false));
            parser.parse(addLast(tagger.apply(cutLast(wordify(binaryTree.yield())))));
          } else {
            // System.out.println("Forcing tags to match input.");
            parser.parse(cleanTags(binaryTree.taggedYield(), tlp));
          }
        } else {
          // System.out.println("XXXX Parsing " + binaryTree.yield());
          parser.parse(binaryTree.yieldHasWord());
        }
        // Timing.tick("Done with pcfg phase.");
      }
      if (op.doDep) {
        dparser.parse(binaryTree.yieldHasWord());
        // Timing.tick("Done with dependency phase.");
      }
      boolean bothPassed = false;
      if (op.doPCFG && op.doDep) {
        bothPassed = bparser.parse(binaryTree.yieldHasWord());
        // Timing.tick("Done with combination phase.");
      }
      long timeMil2 = System.currentTimeMillis();
      long elapsed = timeMil2 - timeMil1;
      System.err.println("Time: " + ((int) (elapsed / 100)) / 10.00 + " sec.");
      // System.out.println("PCFG Best Parse:");
      Tree tree2b = null;
      Tree tree2 = null;
      // System.out.println("Got full best parse...");
      if (op.doPCFG) {
        tree2b = parser.getBestParse();
        tree2 = debinarizer.transformTree(tree2b);
      }
      // System.out.println("Debinarized parse...");
      // tree2.pennPrint();
      // System.out.println("DepG Best Parse:");
      Tree tree3 = null;
      Tree tree3db = null;
      if (op.doDep) {
        tree3 = dparser.getBestParse();
        // was: but wrong Tree tree3db = debinarizer.transformTree(tree2);
        tree3db = debinarizer.transformTree(tree3);
        tree3.pennPrint(pw);
      }
      // tree.pennPrint();
      // ((Tree)binaryTrainTrees.get(tNum)).pennPrint();
      // System.out.println("Combo Best Parse:");
      Tree tree4 = null;
      if (op.doPCFG && op.doDep) {
        try {
          tree4 = bparser.getBestParse();
          if (tree4 == null) {
            tree4 = tree2b;
          }
        } catch (NullPointerException e) {
          System.err.println("Blocked, using PCFG parse!");
          tree4 = tree2b;
        }
      }
      if (op.doPCFG && !bothPassed) {
        tree4 = tree2b;
      }
      // tree4.pennPrint();
      if (op.doDep) {
        depDE.evaluate(tree3, binaryTree, pw);
        depTE.evaluate(tree3db, tree, pw);
      }
      TreeTransformer tc = op.tlpParams.collinizer();
      TreeTransformer tcEvalb = op.tlpParams.collinizerEvalb();
      if (op.doPCFG) {
        // System.out.println("XXXX Best PCFG was: ");
        // tree2.pennPrint();
        // System.out.println("XXXX Transformed best PCFG is: ");
        // tc.transformTree(tree2).pennPrint();
        // System.out.println("True Best Parse:");
        // tree.pennPrint();
        // tc.transformTree(tree).pennPrint();
        pcfgPE.evaluate(tc.transformTree(tree2), tc.transformTree(tree), pw);
        pcfgCB.evaluate(tc.transformTree(tree2), tc.transformTree(tree), pw);
        Tree tree4b = null;
        if (op.doDep) {
          comboDE.evaluate((bothPassed ? tree4 : tree3), binaryTree, pw);
          tree4b = tree4;
          tree4 = debinarizer.transformTree(tree4);
          if (op.nodePrune) {
            NodePruner np = new NodePruner(parser, debinarizer);
            tree4 = np.prune(tree4);
          }
          // tree4.pennPrint();
          comboPE.evaluate(tc.transformTree(tree4), tc.transformTree(tree), pw);
        }
        // pcfgTE.evaluate(tree2, tree);
        pcfgTE.evaluate(tcEvalb.transformTree(tree2), tcEvalb.transformTree(tree), pw);
        pcfgTEnoPunct.evaluate(tc.transformTree(tree2), tc.transformTree(tree), pw);

        if (op.doDep) {
          comboTE.evaluate(tcEvalb.transformTree(tree4), tcEvalb.transformTree(tree), pw);
          comboTEnoPunct.evaluate(tc.transformTree(tree4), tc.transformTree(tree), pw);
        }
        System.out.println("PCFG only: " + parser.scoreBinarizedTree(tree2b, 0));

        // tc.transformTree(tree2).pennPrint();
        tree2.pennPrint(pw);

        if (op.doDep) {
          System.out.println("Combo: " + parser.scoreBinarizedTree(tree4b, 0));
          // tc.transformTree(tree4).pennPrint(pw);
          tree4.pennPrint(pw);
        }
        System.out.println("Correct:" + parser.scoreBinarizedTree(binaryTree, 0));
        /*
        if (parser.scoreBinarizedTree(tree2b,true) < parser.scoreBinarizedTree(binaryTree,true)) {
          System.out.println("SCORE INVERSION");
          parser.validateBinarizedTree(binaryTree,0);
        }
        */
        tree.pennPrint(pw);
      } // end if doPCFG

      if (op.testOptions.evalb) {
        if (op.doPCFG && op.doDep) {
          EvalbFormatWriter.writeEVALBline(
              tcEvalb.transformTree(tree), tcEvalb.transformTree(tree4));
        } else if (op.doPCFG) {
          EvalbFormatWriter.writeEVALBline(
              tcEvalb.transformTree(tree), tcEvalb.transformTree(tree2));
        } else if (op.doDep) {
          EvalbFormatWriter.writeEVALBline(
              tcEvalb.transformTree(tree), tcEvalb.transformTree(tree3db));
        }
      }
    } // end for each tree in test treebank

    if (op.testOptions.evalb) {
      EvalbFormatWriter.closeEVALBfiles();
    }

    // op.testOptions.display();
    if (op.doPCFG) {
      pcfgPE.display(false, pw);
      System.out.println("Grammar size: " + stateIndex.size());
      pcfgCB.display(false, pw);
      if (op.doDep) {
        comboPE.display(false, pw);
      }
      pcfgTE.display(false, pw);
      pcfgTEnoPunct.display(false, pw);
      if (op.doDep) {
        comboTE.display(false, pw);
        comboTEnoPunct.display(false, pw);
      }
    }
    if (op.doDep) {
      depTE.display(false, pw);
      depDE.display(false, pw);
    }
    if (op.doPCFG && op.doDep) {
      comboDE.display(false, pw);
    }
    // pcfgPE.printGoodBad();
  }
  /**
   * Records any typedef annotations present on the given annotation-type declaration.
   *
   * <p>For each annotation on {@code declaration} whose fully qualified name is a "nested"
   * typedef annotation (per {@code Extractor.isNestedAnnotation}), the annotation is added to
   * {@code mMap}, keyed by the declaring type's readable name. Depending on configuration it
   * also emits warnings when the typedef lacks an {@code @hide} doc comment
   * ({@code mRequireHide}) or lacks source retention ({@code mRequireSourceRetention}), and
   * records the class-file path of non-public typedef classes in {@code mTypedefClasses}.
   *
   * @param declaration the type declaration to inspect
   * @return {@code false} if the declaration has no resolved binding (nothing can be recorded),
   *     {@code true} otherwise — even when no typedefs were found
   */
  private boolean recordTypedefs(TypeDeclaration declaration) {
    SourceTypeBinding binding = declaration.binding;
    if (binding == null) {
      return false;
    }
    Annotation[] annotations = declaration.annotations;
    if (annotations != null && binding.isAnnotationType()) {
      for (Annotation annotation : annotations) {
        String typeName = Extractor.getFqn(annotation);
        if (typeName == null || !Extractor.isNestedAnnotation(typeName)) {
          continue;
        }
        String fqn = new String(binding.readableName());

        // Group the typedef annotations by the declaring type's readable name.
        List<Annotation> list = mMap.get(fqn);
        if (list == null) {
          list = new ArrayList<Annotation>(2);
          mMap.put(fqn, list);
        }
        list.add(annotation);

        if (mRequireHide) {
          Javadoc javadoc = declaration.javadoc;
          if (javadoc != null) {
            // javadoc.print requires a StringBuffer (JDT API), so StringBuffer is kept here.
            StringBuffer stringBuffer = new StringBuffer(200);
            javadoc.print(0, stringBuffer);
            String documentation = stringBuffer.toString();
            if (!documentation.contains("@hide")) {
              Extractor.warning(
                  getFileName()
                      + ": The typedef annotation "
                      + fqn
                      + " should specify @hide in a doc comment");
            }
          }
        }
        if (mRequireSourceRetention && !Extractor.hasSourceRetention(annotations)) {
          Extractor.warning(
              getFileName()
                  + ": The typedef annotation "
                  + fqn
                  + " should have @Retention(RetentionPolicy.SOURCE)");
        }
        // NOTE: binding was already checked non-null above, so the original
        // "declaration.binding != null" re-check here was redundant and has been dropped.
        if ((declaration.modifiers & ClassFileConstants.AccPublic) == 0) {
          // Build "com/example/pkg" + separator + "Outer$Inner" — the class-file style
          // path of the non-public typedef class (presumably consumed later to locate or
          // remove these classes; confirm against the user of mTypedefClasses).
          String path =
              new String(binding.qualifiedPackageName()).replace('.', '/')
                  + File.separatorChar
                  + new String(binding.qualifiedSourceName()).replace('.', '$');
          mTypedefClasses.add(path);
        }
      }
    }
    return true;
  }