/**
 * Loads every {@code TokenSample} produced from the {@code ad.sample} test resource into
 * {@code samples} before each test.
 *
 * <p>The sample file and the detokenizer dictionary are resolved from the class path and passed
 * to the stream factory as command-line style arguments. The stream is closed once it has been
 * drained, also when reading fails.
 *
 * @throws IOException if reading from or closing the sample stream fails
 * @throws URISyntaxException if a resource URL cannot be converted to a URI
 */
@Before
  public void setup() throws IOException, URISyntaxException {
    ADTokenSampleStreamFactory factory =
        new ADTokenSampleStreamFactory(ADTokenSampleStreamFactory.Parameters.class);

    File dict =
        new File(
            getClass()
                .getClassLoader()
                .getResource("opennlp/tools/tokenize/latin-detokenizer.xml")
                .toURI());
    File data =
        new File(
            getClass().getClassLoader().getResource("opennlp/tools/formats/ad.sample").toURI());
    String[] args = {
      "-data",
      data.getCanonicalPath(),
      "-encoding",
      "UTF-8",
      "-lang",
      "pt",
      "-detokenizer",
      dict.getCanonicalPath()
    };
    ObjectStream<TokenSample> tokenSampleStream = factory.create(args);

    try {
      TokenSample sample;
      while ((sample = tokenSampleStream.read()) != null) {
        samples.add(sample);
      }
    } finally {
      // release the underlying file handle even if a read fails part-way through
      tokenSampleStream.close();
    }
  }
  /**
   * Categorizes documents read from standard input with the model file named by {@code args[0]}.
   *
   * <p>With no arguments the help text is printed and nothing else happens. Otherwise each
   * paragraph read from {@code System.in} is whitespace-tokenized and categorized, and the
   * resulting sample is printed to standard output. Throughput is reported on standard error.
   *
   * @param args command line arguments; the first one is the path of the model file
   */
  public void run(String[] args) {
    if (args.length == 0) {
      System.out.println(getHelp());
      return;
    }

    DoccatModelLoader loader = new DoccatModelLoader();
    DoccatModel model = loader.load(new File(args[0]));
    DocumentCategorizerME categorizer = new DocumentCategorizerME(model);

    InputStreamReader stdin = new InputStreamReader(System.in);
    ObjectStream<String> documents = new ParagraphStream(new PlainTextByLineStream(stdin));

    PerformanceMonitor monitor = new PerformanceMonitor(System.err, "doc");
    monitor.start();

    try {
      for (String text = documents.read(); text != null; text = documents.read()) {
        String[] tokens = WhitespaceTokenizer.INSTANCE.tokenize(text);
        double[] outcomes = categorizer.categorize(tokens);
        String best = categorizer.getBestCategory(outcomes);

        DocumentSample labelled = new DocumentSample(best, text);
        System.out.println(labelled.toString());

        monitor.incrementCounter();
      }
    } catch (IOException e) {
      CmdLineUtil.handleStdinIoError(e);
    }

    monitor.stopAndPrintFinalResult();
  }
  /**
   * Builds a name dictionary from a census data file and writes it to the configured output file.
   *
   * <p>Both file locations and the input encoding come from the parsed {@code Parameters}. Any
   * I/O failure while reading the names or writing the dictionary is reported as a
   * {@code TerminateToolException} with code {@code -1}.
   *
   * @param args command line arguments to be validated and parsed
   */
  public void run(String[] args) {
    Parameters params = validateAndParseParams(args, Parameters.class);

    File censusFile = new File(params.getCensusData());
    File dictionaryFile = new File(params.getDict());

    CmdLineUtil.checkInputFile("Name data", censusFile);
    CmdLineUtil.checkOutputFile("Dictionary file", dictionaryFile);

    FileInputStream censusIn = CmdLineUtil.openInFile(censusFile);
    Charset encoding = Charset.forName(params.getEncoding());
    ObjectStream<StringList> names = new NameFinderCensus90NameStream(censusIn, encoding);

    Dictionary dictionary;
    try {
      System.out.println("Creating Dictionary...");
      dictionary = createDictionary(names);
    } catch (IOException readFailure) {
      String message =
          "IO error while reading training data or indexing data: " + readFailure.getMessage();
      throw new TerminateToolException(-1, message, readFailure);
    } finally {
      try {
        names.close();
      } catch (IOException ignored) {
        // closing the input is best effort; a failure here is deliberately not reported
      }
    }

    System.out.println("Saving Dictionary...");

    OutputStream out = null;
    try {
      out = new FileOutputStream(dictionaryFile);
      dictionary.serialize(out);
    } catch (IOException writeFailure) {
      String message = "IO error while writing dictionary file: " + writeFailure.getMessage();
      throw new TerminateToolException(-1, message, writeFailure);
    } finally {
      if (out != null) {
        try {
          out.close();
        } catch (IOException closeFailure) {
          // a failed close may have left an incomplete file behind, so report it
          String message =
              "Attention: Failed to correctly write dictionary:" + closeFailure.getMessage();
          throw new TerminateToolException(-1, message, closeFailure);
        }
      }
    }
  }
  /**
   * Reads the next line from the underlying line stream and returns the name at its start.
   *
   * <p>The name is the text before the first space on the line. It is converted from upper case
   * to mixed case: the first letter stays upper case and the rest is lower-cased, except that a
   * name longer than two characters starting with {@code "MC"} also keeps its third letter upper
   * case (for example {@code MCDONALD} becomes {@code McDonald}).
   *
   * @return a single-token {@code StringList} holding the name, or {@code null} when the
   *     underlying stream is exhausted, the line is empty, or the line has no non-empty name
   *     before its first space
   * @throws IOException if reading from the underlying line stream fails
   */
  public StringList read() throws IOException {
    String line = lineStream.read();
    StringList name = null;

    if ((line != null) && (!StringUtil.isEmpty(line))) {
      String name2;
      // find the location of the name separator in the line of data.
      int pos = line.indexOf(' ');
      // pos == 0 means the line starts with the separator, so there is no name to parse;
      // substring(0, 1) on the empty prefix would throw StringIndexOutOfBoundsException.
      if (pos > 0) {
        String parsed = line.substring(0, pos);
        // the data is in ALL CAPS ... so the easiest way is to convert
        // back to standard mixed case.
        if ((parsed.length() > 2) && (parsed.startsWith("MC"))) {
          name2 =
              parsed.substring(0, 1).toUpperCase(locale)
                  + parsed.substring(1, 2).toLowerCase(locale)
                  + parsed.substring(2, 3).toUpperCase(locale)
                  + parsed.substring(3).toLowerCase(locale);
        } else {
          name2 =
              parsed.substring(0, 1).toUpperCase(locale) + parsed.substring(1).toLowerCase(locale);
        }
        name = new StringList(new String[] {name2});
      }
    }

    return name;
  }
  /**
   * Builds a dictionary from all entries of the given stream.
   *
   * <p>The stream is read until it returns {@code null}; each entry not yet present in the
   * dictionary is added to it.
   *
   * @param sampleStream stream of entries to collect.
   * @return a {@code Dictionary} containing every distinct entry read from the stream.
   * @throws IOException if reading from the stream fails
   */
  public static Dictionary createDictionary(ObjectStream<StringList> sampleStream)
      throws IOException {

    Dictionary names = new Dictionary(true);

    for (StringList entry = sampleStream.read(); entry != null; entry = sampleStream.read()) {
      if (names.contains(entry)) {
        // already known, nothing to add
        continue;
      }
      names.put(entry);
    }

    return names;
  }
Example #6
0
  /**
   * Trains a name finder model from a training file and serializes it to a binary model file.
   *
   * <p>The training data is read as UTF-8 from {@code ModelTypes.TRAIN_FILE_BASE_LOCATION +
   * trainFile + ".train"}, and the trained model is written to
   * {@code ModelTypes.BIN_FILE_BASE_LOCATION + trainFile + ".bin"}.
   *
   * @param modelType the name type passed to the trainer
   * @param trainFile base name (without extension) of both the training file and the model file
   * @throws IOException if the training data cannot be read or the model cannot be written
   */
  public void createModel(String modelType, String trainFile) throws IOException {
    Charset charset = Charset.forName("UTF-8");
    System.out.println("File path:" + trainFile);
    ObjectStream<String> lineStream =
        new PlainTextByLineStream(
            new FileInputStream(ModelTypes.TRAIN_FILE_BASE_LOCATION + trainFile + ".train"),
            charset);
    ObjectStream<NameSample> sampleStream = new NameSampleDataStream(lineStream);

    TokenNameFinderModel model;
    try {
      // No custom feature generator and no extra resources are supplied.
      // NOTE(review): the trailing 100 and 1 are presumably the training iterations and the
      // feature cutoff - confirm against the NameFinderME.train signature in use.
      model =
          NameFinderME.train(
              "en",
              modelType,
              sampleStream,
              (AdaptiveFeatureGenerator) null,
              Collections.<String, Object>emptyMap(),
              100,
              1);
    } finally {
      sampleStream.close();
    }

    BufferedOutputStream modelOut = null;
    try {
      modelOut =
          new BufferedOutputStream(
              new FileOutputStream(ModelTypes.BIN_FILE_BASE_LOCATION + trainFile + ".bin"));
      model.serialize(modelOut);
    } finally {
      if (modelOut != null) {
        modelOut.close();
      }
    }
  }
  /**
   * Builds a dictionary from the sentences of the given samples.
   *
   * <p>Every non-empty sentence is added to an {@code NGramModel}; the model is then pruned with
   * {@code cutoff} as the lower bound and converted to a dictionary.
   *
   * @param samples the samples whose sentences are collected; read until exhausted
   * @param cutoff lower bound passed to {@code NGramModel.cutoff}
   * @return the dictionary produced by {@code NGramModel.toDictionary(true)}
   * @throws IOException if reading the samples fails
   */
  public static Dictionary buildNGramDictionary(ObjectStream<POSSample> samples, int cutoff)
      throws IOException {

    NGramModel model = new NGramModel();

    for (POSSample current = samples.read(); current != null; current = samples.read()) {
      String[] sentence = current.getSentence();
      if (sentence.length == 0) {
        // nothing to record for an empty sentence
        continue;
      }
      model.add(new StringList(sentence), 1, 1);
    }

    model.cutoff(cutoff, Integer.MAX_VALUE);

    return model.toDictionary(true);
  }
  /**
   * Extends a tag dictionary with the word/tag pairs observed in the given samples.
   *
   * <p>Occurrences are counted per word and tag, skipping tokens that contain a digit. Tags the
   * dictionary already lists for a word start with a count of {@code cutoff}. Afterwards every
   * word is written back to the dictionary with the tags whose count reached {@code cutoff};
   * words without such a tag are left untouched. Progress and elapsed time are printed to
   * standard output.
   *
   * @param samples the tagged sentences to count; read until exhausted
   * @param dict the dictionary to read existing tags from and to update
   * @param cutoff minimum number of occurrences a word/tag pair needs to be stored
   * @throws IOException if reading the samples fails
   */
  public static void populatePOSDictionary(
      ObjectStream<POSSample> samples, MutableTagDictionary dict, int cutoff) throws IOException {
    System.out.println("Expanding POS Dictionary ...");
    final long startedAt = System.nanoTime();

    // word -> (tag -> number of occurrences)
    Map<String, Map<String, AtomicInteger>> counts =
        new HashMap<String, Map<String, AtomicInteger>>();

    for (POSSample current = samples.read(); current != null; current = samples.read()) {
      String[] sentence = current.getSentence();
      String[] sentenceTags = current.getTags();

      for (int idx = 0; idx < sentence.length; idx++) {
        String token = sentence[idx];

        // tokens containing digits are not treated as words
        if (StringPattern.recognize(token).containsDigit()) {
          continue;
        }

        String key = dict.isCaseSensitive() ? token : StringUtil.toLowerCase(token);

        Map<String, AtomicInteger> tagCounts = counts.get(key);
        if (tagCounts == null) {
          tagCounts = new HashMap<String, AtomicInteger>();
          counts.put(key, tagCounts);
        }

        // tags already in the dictionary are seeded with the cutoff so they are always kept
        String[] knownTags = dict.getTags(key);
        if (knownTags != null) {
          for (String knownTag : knownTags) {
            if (!tagCounts.containsKey(knownTag)) {
              tagCounts.put(knownTag, new AtomicInteger(cutoff));
            }
          }
        }

        String observedTag = sentenceTags[idx];
        AtomicInteger counter = tagCounts.get(observedTag);
        if (counter == null) {
          tagCounts.put(observedTag, new AtomicInteger(1));
        } else {
          counter.incrementAndGet();
        }
      }
    }

    // store each word together with the tags that occurred often enough
    for (Entry<String, Map<String, AtomicInteger>> perWord : counts.entrySet()) {
      List<String> keptTags = new ArrayList<String>();
      for (Entry<String, AtomicInteger> perTag : perWord.getValue().entrySet()) {
        if (perTag.getValue().get() >= cutoff) {
          keptTags.add(perTag.getKey());
        }
      }
      if (!keptTags.isEmpty()) {
        dict.put(perWord.getKey(), keptTags.toArray(new String[keptTags.size()]));
      }
    }

    long elapsedMillis = (System.nanoTime() - startedAt) / 1000000;
    System.out.println("... finished expanding POS Dictionary. [" + elapsedMillis + "ms]");
  }
 /**
  * Closes this stream by closing the underlying line stream.
  *
  * @throws IOException if closing the underlying line stream fails
  */
 public void close() throws IOException {
   lineStream.close();
 }
 /**
  * Resets this stream by delegating to the underlying line stream's {@code reset()}.
  *
  * @throws IOException if resetting the underlying line stream fails
  * @throws UnsupportedOperationException presumably when the underlying line stream does not
  *     support resetting - confirm against the stream implementation in use
  */
 public void reset() throws IOException, UnsupportedOperationException {
   lineStream.reset();
 }
  /**
   * Reads the next sentence from the underlying line stream and converts it into a
   * {@code NameSample}.
   *
   * <p>A sentence is a run of lines terminated by an empty line or the end of the stream. Each
   * line must consist of exactly two tab-separated fields: the token and its tag. Accepted tags
   * are {@code O}, {@code B-...} and {@code I-...}; consecutive tagged tokens are merged into
   * name spans.
   *
   * <p>The sample is marked to clear adaptive data when {@code clearFeatures} equals
   * {@code "yes"} (ignoring case), or when it equals {@code "docstart"} and a line starting with
   * {@code -DOCSTART-} was read; such a marker line must be followed by an empty line.
   *
   * <p>NOTE(review): a sentence without tokens is skipped by calling this method recursively.
   * The clear-adaptive-data flag set by a {@code -DOCSTART-} line in the current call does not
   * appear to be carried into that recursive call - confirm whether this matters for the input
   * data.
   *
   * @return the next sample, or {@code null} when the underlying stream has no more lines
   * @throws IOException if reading fails, a line does not have exactly two fields, the line
   *     after {@code -DOCSTART-} is not empty, or a tag is not recognized
   */
  public NameSample read() throws IOException {

    List<String> tokens = new ArrayList<String>();
    List<String> neTypes = new ArrayList<String>();
    boolean isClearAdaptiveData = false;

    // Empty line indicates end of sentence
    String line;
    while ((line = lineStream.read()) != null && !StringUtil.isEmpty(line)) {
      // clear adaptive data if document mark appears following
      // CoNLL03 conventions
      if (clearFeatures.equalsIgnoreCase("docstart") && line.startsWith("-DOCSTART-")) {
        isClearAdaptiveData = true;
        // the marker line is expected to be followed by an empty line, which is consumed here
        String emptyLine = lineStream.read();
        if (!StringUtil.isEmpty(emptyLine))
          throw new IOException("Empty line after -DOCSTART- not empty: '" + emptyLine + "'!");
        continue;
      }
      // each data line is "<token>\t<tag>"
      String fields[] = line.split("\t");
      if (fields.length == 2) {
        tokens.add(fields[0]);
        neTypes.add(fields[1]);
      } else {
        throw new IOException(
            "Expected two fields per line in training data, got "
                + fields.length
                + " for line '"
                + line
                + "'!");
      }
    }
    // if no -DOCSTART- mark, check if we need to clear features every sentence
    if (clearFeatures.equalsIgnoreCase("yes")) {
      isClearAdaptiveData = true;
    }

    if (tokens.size() > 0) {
      // convert name tags into spans
      List<Span> names = new ArrayList<Span>();

      // [beginIndex, endIndex) is the span currently being built; -1 means no open span
      int beginIndex = -1;
      int endIndex = -1;
      for (int i = 0; i < neTypes.size(); i++) {
        String neTag = neTypes.get(i);
        if (neTag.equals("O")) {
          // O means we don't have anything this round.
          if (beginIndex != -1) {
            names.add(extract(beginIndex, endIndex, neTypes.get(beginIndex)));
            beginIndex = -1;
            endIndex = -1;
          }
        } else if (neTag.startsWith("B-")) {
          // B- always starts a new span and closes any open one; it is used when two
          // entities of the same class are next to each other
          if (beginIndex != -1) {
            names.add(extract(beginIndex, endIndex, neTypes.get(beginIndex)));
          }
          beginIndex = i;
          endIndex = i + 1;
        } else if (neTag.startsWith("I-")) {
          // I- starts or continues a current name entity
          if (beginIndex == -1) {
            beginIndex = i;
            endIndex = i + 1;
          } else if (!neTag.endsWith(neTypes.get(beginIndex).substring(1))) {
            // we have a new tag type following a tagged word series
            // also may not have the same I- starting the previous!
            // (the comparison drops the first character of the opening tag, so "B-X" and
            // "I-X" are treated as the same type)
            names.add(extract(beginIndex, endIndex, neTypes.get(beginIndex)));
            beginIndex = i;
            endIndex = i + 1;
          } else {
            endIndex++;
          }
        } else {
          throw new IOException("Invalid tag: " + neTag);
        }
      }

      // if one span remains, create it here
      if (beginIndex != -1) names.add(extract(beginIndex, endIndex, neTypes.get(beginIndex)));

      return new NameSample(
          tokens.toArray(new String[tokens.size()]),
          names.toArray(new Span[names.size()]),
          isClearAdaptiveData);
    } else if (line != null) {
      // Just filter out empty events, if two lines in a row are empty
      return read();
    } else {
      // source stream is not returning anymore lines
      return null;
    }
  }