/**
 * Reads the next line from the line stream and extracts the name that precedes the first
 * space, converting it from ALL CAPS to mixed case.
 *
 * <p>A name longer than two characters that starts with {@code MC} also gets its third letter
 * capitalized (for example {@code MCDONALD} becomes {@code McDonald}); every other name has only
 * its first letter capitalized. All case conversions use the configured {@code locale}.
 *
 * @return a single-element {@link StringList} holding the converted name, or {@code null} when
 *     the stream returned no line, the line is reported empty by {@code StringUtil.isEmpty},
 *     the line contains no space, or the line starts with a space (so there is no name in
 *     front of the separator)
 * @throws IOException propagated from the underlying line stream
 */
public StringList read() throws IOException {
    String line = lineStream.read();
    if ((line == null) || StringUtil.isEmpty(line)) {
      return null;
    }

    // find the location of the name separator in the line of data.
    int pos = line.indexOf(' ');
    // pos == -1: no separator at all. pos == 0: the name would be the empty string, and
    // capitalizing its (non-existent) first character would throw
    // StringIndexOutOfBoundsException. Both cases yield no name.
    if (pos <= 0) {
      return null;
    }

    String parsed = line.substring(0, pos);
    String name2;
    // the data is in ALL CAPS ... so the easiest way is to convert
    // back to standard mixed case.
    if ((parsed.length() > 2) && (parsed.startsWith("MC"))) {
      // "MCxyz" -> "Mc" + "X" + "yz"
      name2 =
          parsed.substring(0, 1).toUpperCase(locale)
              + parsed.substring(1, 2).toLowerCase(locale)
              + parsed.substring(2, 3).toUpperCase(locale)
              + parsed.substring(3).toLowerCase(locale);
    } else {
      name2 =
          parsed.substring(0, 1).toUpperCase(locale) + parsed.substring(1).toLowerCase(locale);
    }

    return new StringList(new String[] {name2});
  }
Esempio n. 2
0
 /**
  * Splits the given text at whitespace and returns the character offsets of the tokens.
  *
  * <p>There is limited support for punctuation at the end of words: when a token of at least
  * two characters ends with one of {@code . , ! ? ; :}, that last character is returned as a
  * separate token directly after the word.
  *
  * @param s the text to tokenize
  * @return the token spans in order of appearance, as {@code [start, end)} character offsets
  *     into {@code s}; an empty array when {@code s} contains no non-whitespace character
  */
 public Span[] tokenizePos(String s) {
   List<Span> tokens = new ArrayList<Span>();
   int sl = s.length();
   int start = -1; // begin offset of the token being scanned, -1 while between tokens
   char pc = 0; // character seen in the previous iteration
   // Iterate one position past the end: the virtual trailing blank closes a token that
   // runs up to the end of the text.
   for (int ci = 0; ci <= sl; ci++) {
     char c = ci < sl ? s.charAt(ci) : ' ';
     boolean isWhitespace = StringUtil.isWhitespace(c);
     if (!isWhitespace && start < 0) { // new token starts
       start = ci;
     }
     if (isWhitespace && start >= 0) { // end of token
       // limited support for punctuation at the end of words
       if (start < ci - 1
           && (pc == '.' || pc == ',' || pc == '!' || pc == '?' || pc == ';' || pc == ':')) {
         tokens.add(new Span(start, ci - 1));
         tokens.add(new Span(ci - 1, ci));
       } else {
         tokens.add(new Span(start, ci));
       }
       start = -1;
     }
     // Remember this character so the end-of-token check above can inspect the last
     // character of the word; without this update the punctuation branch is never taken.
     pc = c;
   }
   return tokens.toArray(new Span[tokens.size()]);
 }
  /**
   * Expands the given tag dictionary with the word/tag pairs observed in the samples.
   *
   * <p>Every sample is read from {@code samples} and each word/tag pair is counted, skipping
   * words for which {@code StringPattern} reports a digit. Words are lower-cased first when the
   * dictionary is not case sensitive. Tags the dictionary already lists for a word are seeded
   * with a count of {@code cutoff}, so they always meet the threshold. Finally, for every word
   * the tags whose count is at least {@code cutoff} are put into the dictionary; words without
   * such a tag are not touched. Progress and elapsed time are printed to {@code System.out}.
   *
   * @param samples the samples to read until the stream returns {@code null}
   * @param dict the dictionary to expand
   * @param cutoff the minimum number of occurrences a word/tag pair needs to be added
   * @throws IOException propagated from reading {@code samples}
   */
  public static void populatePOSDictionary(
      ObjectStream<POSSample> samples, MutableTagDictionary dict, int cutoff) throws IOException {
    System.out.println("Expanding POS Dictionary ...");
    final long startNanos = System.nanoTime();

    // word -> (tag -> number of occurrences)
    Map<String, Map<String, AtomicInteger>> countsByWord =
        new HashMap<String, Map<String, AtomicInteger>>();

    for (POSSample sample = samples.read(); sample != null; sample = samples.read()) {
      String[] words = sample.getSentence();
      String[] tags = sample.getTags();

      for (int i = 0; i < words.length; i++) {
        // only store words
        if (StringPattern.recognize(words[i]).containsDigit()) {
          continue;
        }

        String word = dict.isCaseSensitive() ? words[i] : StringUtil.toLowerCase(words[i]);

        Map<String, AtomicInteger> tagCounts = countsByWord.get(word);
        if (tagCounts == null) {
          tagCounts = new HashMap<String, AtomicInteger>();
          countsByWord.put(word, tagCounts);
        }

        // tags already known to the dictionary start at the cutoff
        String[] knownTags = dict.getTags(word);
        if (knownTags != null) {
          for (String knownTag : knownTags) {
            if (!tagCounts.containsKey(knownTag)) {
              tagCounts.put(knownTag, new AtomicInteger(cutoff));
            }
          }
        }

        AtomicInteger observed = tagCounts.get(tags[i]);
        if (observed == null) {
          tagCounts.put(tags[i], new AtomicInteger(1));
        } else {
          observed.incrementAndGet();
        }
      }
    }

    // keep, per word, only the tags that occurred often enough and store them
    for (Entry<String, Map<String, AtomicInteger>> wordEntry : countsByWord.entrySet()) {
      List<String> frequentTags = new ArrayList<String>();
      for (Entry<String, AtomicInteger> tagEntry : wordEntry.getValue().entrySet()) {
        int occurrences = tagEntry.getValue().get();
        if (occurrences >= cutoff) {
          frequentTags.add(tagEntry.getKey());
        }
      }
      if (!frequentTags.isEmpty()) {
        String[] tagArray = frequentTags.toArray(new String[frequentTags.size()]);
        dict.put(wordEntry.getKey(), tagArray);
      }
    }

    long elapsedMillis = (System.nanoTime() - startNanos) / 1000000;
    System.out.println("... finished expanding POS Dictionary. [" + elapsedMillis + "ms]");
  }
  /**
   * Reads the next sentence from the line stream and converts its tags into name spans.
   *
   * <p>Each non-empty line must consist of exactly two tab-separated fields, the token and its
   * tag; an empty line ends the sentence. Accepted tags are {@code O}, {@code B-<type>} and
   * {@code I-<type>}. When {@code clearFeatures} is {@code "docstart"} (ignoring case), a line
   * starting with {@code -DOCSTART-} marks the sample for clearing adaptive data and must be
   * followed by an empty line (or the end of the stream). When {@code clearFeatures} is
   * {@code "yes"}, every sample is marked.
   *
   * @return the next sample, or {@code null} when the stream has no more lines and no tokens
   *     were collected
   * @throws IOException if a line does not have exactly two fields, a tag is not one of the
   *     accepted forms, the line after {@code -DOCSTART-} is not empty, or the underlying
   *     stream fails
   */
  public NameSample read() throws IOException {

    List<String> tokens = new ArrayList<String>();
    List<String> neTypes = new ArrayList<String>();
    boolean isClearAdaptiveData = false;

    // Empty line indicates end of sentence
    String line;
    while ((line = lineStream.read()) != null && !StringUtil.isEmpty(line)) {
      // clear adaptive data if document mark appears following
      // CoNLL03 conventions
      if (clearFeatures.equalsIgnoreCase("docstart") && line.startsWith("-DOCSTART-")) {
        isClearAdaptiveData = true;
        String emptyLine = lineStream.read();
        if (emptyLine == null) {
          // The stream ended right after the document mark. Do not hand null to
          // StringUtil.isEmpty; record the end of the stream and stop reading.
          line = null;
          break;
        }
        if (!StringUtil.isEmpty(emptyLine)) {
          throw new IOException("Empty line after -DOCSTART- not empty: '" + emptyLine + "'!");
        }
        continue;
      }
      String[] fields = line.split("\t");
      if (fields.length == 2) {
        tokens.add(fields[0]);
        neTypes.add(fields[1]);
      } else {
        throw new IOException(
            "Expected two fields per line in training data, got "
                + fields.length
                + " for line '"
                + line
                + "'!");
      }
    }
    // if no -DOCSTART- mark, check if we need to clear features every sentence
    if (clearFeatures.equalsIgnoreCase("yes")) {
      isClearAdaptiveData = true;
    }

    if (tokens.size() > 0) {
      // convert name tags into spans
      List<Span> names = new ArrayList<Span>();

      // token range [beginIndex, endIndex) of the name being built; -1 while outside a name
      int beginIndex = -1;
      int endIndex = -1;
      for (int i = 0; i < neTypes.size(); i++) {
        String neTag = neTypes.get(i);
        if (neTag.equals("O")) {
          // O means we don't have anything this round.
          if (beginIndex != -1) {
            names.add(extract(beginIndex, endIndex, neTypes.get(beginIndex)));
            beginIndex = -1;
            endIndex = -1;
          }
        } else if (neTag.startsWith("B-")) {
          // B- always opens a new name, which lets two names of the same class sit next to
          // each other; close the one in progress first
          if (beginIndex != -1) {
            names.add(extract(beginIndex, endIndex, neTypes.get(beginIndex)));
          }
          beginIndex = i;
          endIndex = i + 1;
        } else if (neTag.startsWith("I-")) {
          // I- starts or continues a current name entity
          if (beginIndex == -1) {
            beginIndex = i;
            endIndex = i + 1;
          } else if (!neTag.endsWith(neTypes.get(beginIndex).substring(1))) {
            // we have a new tag type following a tagged word series
            // also may not have the same I- starting the previous!
            // (substring(1) drops the B/I letter so only the "-<type>" suffix is compared)
            names.add(extract(beginIndex, endIndex, neTypes.get(beginIndex)));
            beginIndex = i;
            endIndex = i + 1;
          } else {
            endIndex++;
          }
        } else {
          throw new IOException("Invalid tag: " + neTag);
        }
      }

      // if one span remains, create it here
      if (beginIndex != -1) {
        names.add(extract(beginIndex, endIndex, neTypes.get(beginIndex)));
      }

      return new NameSample(
          tokens.toArray(new String[tokens.size()]),
          names.toArray(new Span[names.size()]),
          isClearAdaptiveData);
    } else if (line != null) {
      // Just filter out empty events, if two lines in a row are empty
      return read();
    } else {
      // source stream is not returning anymore lines
      return null;
    }
  }