示例#1
0
  /**
   * Collects n-gram counts from the given CAS and passes them to {@code add(cfd)}.
   *
   * <p>For every annotation of {@code sentenceType} and every entry of {@code inputPaths}, the
   * annotations covered by the sentence are converted to strings, and the n-grams of each
   * length from {@code minNgramLength} to {@code maxNgramLength} (inclusive) are counted under
   * that length.
   *
   * @param jcas the JCas whose underlying CAS is read
   * @param inputPaths paths of the form {@code typeName} or {@code typeName/rest}; the part
   *     before the first {@code '/'} is resolved through {@code getInputType}
   * @param sentenceType the annotation type whose instances delimit the n-grams
   * @throws IOException if {@code createStringList} fails with an
   *     {@code AnalysisEngineProcessException} (kept as the cause), or if {@code add(cfd)} fails
   */
  public void add(JCas jcas, Set<String> inputPaths, Type sentenceType) throws IOException {
    ConditionalFrequencyDistribution<Integer, String> ngramCounts =
        new ConditionalFrequencyDistribution<Integer, String>();
    CAS view = jcas.getCas();

    for (AnnotationFS sentence : CasUtil.select(view, sentenceType)) {
      for (String inputPath : inputPaths) {
        // Split around the first '/' only: element 0 is the type name, and the whole array is
        // handed on to createStringList.
        String[] pathParts = inputPath.split("/", 2);
        Type coveredType = getInputType(view, pathParts[0]);
        List<AnnotationFS> covered = CasUtil.selectCovered(view, coveredType, sentence);

        List<String> strings;
        try {
          strings = createStringList(covered, pathParts);
        } catch (AnalysisEngineProcessException e) {
          // Callers only deal with IOException; keep the original failure as the cause.
          throw new IOException(e);
        }

        // One pass per n-gram length; the same value is passed for both length arguments
        // (presumably min and max n - confirm against NGramStringIterable).
        int n = minNgramLength;
        while (n <= maxNgramLength) {
          ngramCounts.incAll(n, new NGramStringIterable(strings, n, n));
          n++;
        }
      }
    }

    add(ngramCounts);
  }
示例#2
0
  /**
   * Writes each condition (n-gram level) of the given distribution to its n-gram file.
   *
   * @param cfd the conditional frequency distribution, keyed by n-gram level
   * @throws IOException if {@code ngramWriters} has no entry for one of the levels, or if
   *     writing a level fails
   */
  private void writeFrequencyDistributionsToNGramFiles(
      ConditionalFrequencyDistribution<Integer, String> cfd) throws IOException {
    for (int ngramLevel : cfd.getConditions()) {
      if (ngramWriters.containsKey(ngramLevel)) {
        writeNGramFile(cfd, ngramLevel);
      } else {
        // Stop at the first level without a writer; levels already written stay written.
        throw new IOException("No writer for ngram level " + ngramLevel + " initialized.");
      }
    }
  }
示例#3
0
  /**
   * Writes all samples of one n-gram level to that level's writer and records each sample's
   * lower-cased prefix in that level's letter frequency distribution.
   *
   * <p>Each sample is written as {@code key TAB count LF}. The prefix is the first two
   * characters of the key, or the single character of a one-character key. An empty key has no
   * prefix: its line is still written, but the letter frequency distribution is left untouched
   * (previously this case failed with a {@code StringIndexOutOfBoundsException}).
   *
   * @param cfd the conditional frequency distribution holding the counts
   * @param level the n-gram level (condition) to write; used as the key into
   *     {@code letterFDs} and {@code ngramWriters}
   * @throws IOException if writing to or flushing the level's writer fails
   */
  private void writeNGramFile(ConditionalFrequencyDistribution<Integer, String> cfd, int level)
      throws IOException {
    FrequencyDistribution<String> letterFD = letterFDs.get(level);
    BufferedWriter writer = ngramWriters.get(level);
    for (String key : cfd.getFrequencyDistribution(level).getKeys()) {

      // Add the starting letters (at most two characters) to the frequency distribution.
      // NOTE(review): toLowerCase() uses the default locale; confirm how consumers of this
      // distribution lower-case their lookups before switching to a fixed locale.
      if (key.length() > 0) {
        String prefix = key.substring(0, Math.min(2, key.length()));
        letterFD.addSample(prefix.toLowerCase(), 1);
      }

      writer.write(key);
      writer.write(TAB);
      writer.write(Long.toString(cfd.getCount(level, key)));
      writer.write(LF);
    }
    // The writer is flushed but not closed here.
    writer.flush();
  }