Example #1
  /**
   * Extracts location names from unstructured text using the named entity recognizer (NER) feature
   * provided by the Apache OpenNLP Name Finder.
   *
   * @param plainText Contents of text document
   * @return List of location names and positions
   */
  public List<LocationOccurrence> extractLocationNames(String plainText) {
    if (plainText == null) {
      throw new IllegalArgumentException(
          "plainText input to extractLocationNames must not be null");
    }

    List<LocationOccurrence> nerResults = new ArrayList<LocationOccurrence>();

    // The values used in these Spans are string character offsets
    Span[] sentenceSpans = sentenceDetector.sentPosDetect(plainText);

    // Each sentence gets processed on its own
    for (Span sentenceSpan : sentenceSpans) {

      // extract this sentence's text using its start and end offsets within the document
      String sentence = plainText.substring(sentenceSpan.getStart(), sentenceSpan.getEnd());

      // tokenize the text into the required OpenNLP format
      String[] tokens = tokenizer.tokenize(sentence);

      // the values used in these Spans are string character offsets of each token from the sentence
      // beginning
      Span[] tokenPositionsWithinSentence = tokenizer.tokenizePos(sentence);

      // find the location names in the tokenized text
      // the values used in these Spans are NOT string character offsets, they are indices into the
      // 'tokens' array
      Span[] names = nameFinder.find(tokens);

      // for each name that got found, create our corresponding occurrence
      for (Span name : names) {

        // find offsets relative to the start of the sentence
        int beginningOfFirstWord = tokenPositionsWithinSentence[name.getStart()].getStart();
        int endOfLastWord =
            tokenPositionsWithinSentence[name.getEnd() - 1]
                .getEnd(); // -1 because the high end of a Span is non-inclusive

        // to get offsets relative to the document as a whole, just add the offset for the sentence
        // itself
        int startOffsetInDoc = sentenceSpan.getStart() + beginningOfFirstWord;
        int endOffsetInDoc = sentenceSpan.getStart() + endOfLastWord;

        // look back into the original input string to recover the exact text that was matched
        String nameInDocument = plainText.substring(startOffsetInDoc, endOffsetInDoc);

        // add to List of results to return
        nerResults.add(new LocationOccurrence(nameInDocument, startOffsetInDoc));
      }
    }

    // clear the adaptive data so that results stay consistent
    // across multiple runs on the same input
    nameFinder.clearAdaptiveData();

    return nerResults;
  }
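
The method above depends on three fields that this example does not show: sentenceDetector, tokenizer, and nameFinder. A minimal setup sketch, assuming the stock OpenNLP English binary models; the model file names and the host class name are illustrative, not part of the original example:

  import java.io.FileInputStream;
  import java.io.IOException;
  import java.io.InputStream;
  import opennlp.tools.namefind.NameFinderME;
  import opennlp.tools.namefind.TokenNameFinderModel;
  import opennlp.tools.sentdetect.SentenceDetectorME;
  import opennlp.tools.sentdetect.SentenceModel;
  import opennlp.tools.tokenize.TokenizerME;
  import opennlp.tools.tokenize.TokenizerModel;

  public class LocationExtractor { // hypothetical host class for the method above

    private final SentenceDetectorME sentenceDetector;
    private final TokenizerME tokenizer;
    private final NameFinderME nameFinder;

    public LocationExtractor() throws IOException {
      // the file names below assume the stock OpenNLP English models
      try (InputStream sentIn = new FileInputStream("en-sent.bin");
          InputStream tokIn = new FileInputStream("en-token.bin");
          InputStream nerIn = new FileInputStream("en-ner-location.bin")) {
        sentenceDetector = new SentenceDetectorME(new SentenceModel(sentIn));
        tokenizer = new TokenizerME(new TokenizerModel(tokIn));
        nameFinder = new NameFinderME(new TokenNameFinderModel(nerIn));
      }
    }
  }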
Example #2
  public void createModel(String modelType, String trainFile) throws IOException {
    Charset charset = Charset.forName("UTF-8");
    System.out.println("File path: " + trainFile);
    ObjectStream<String> lineStream =
        new PlainTextByLineStream(
            new FileInputStream(ModelTypes.TRAIN_FILE_BASE_LOCATION + trainFile + ".train"),
            charset);
    ObjectStream<NameSample> sampleStream = new NameSampleDataStream(lineStream);

    TokenNameFinderModel model;

    BufferedOutputStream modelOut = null;
    try {
      // train a maxent name-finder model with 100 iterations and a cutoff of 1
      model =
          NameFinderME.train(
              "en",
              modelType,
              sampleStream,
              (AdaptiveFeatureGenerator) null,
              Collections.<String, Object>emptyMap(),
              100,
              1);
      // alternative overloads:
      // model = NameFinderME.train("en", modelType, sampleStream, null);
      // model = NameFinderME.train("en", modelType, sampleStream,
      //     Collections.<String, Object>emptyMap(), 70, 1);
    } finally {
      sampleStream.close();
    }

    try {
      modelOut =
          new BufferedOutputStream(
              new FileOutputStream(ModelTypes.BIN_FILE_BASE_LOCATION + trainFile + ".bin"));
      model.serialize(modelOut);
    } finally {
      if (modelOut != null) modelOut.close();
    }
  }
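
The .train file read above must be in OpenNLP's name-annotation format: one whitespace-tokenized sentence per line, with each name wrapped in <START:type> ... <END> tags. A hedged usage sketch; "location" and the file stem are illustrative values, and the actual paths come from the ModelTypes constants:

  // a line of the hypothetical en-ner-location.train file would look like:
  //   We flew to <START:location> Berlin <END> on Monday .
  createModel("location", "en-ner-location");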
Example #3
  public void analyze(Document document) {
    List<Sentence> sentences = document.getSentences();

    for (Sentence sentence : sentences) {
      Span[] contractionsSpan;

      // the contraction finder is shared mutable state, so serialize access to it
      synchronized (this.contractionFinder) {
        contractionsSpan = contractionFinder.find(TextUtils.tokensToString(sentence.getTokens()));
      }

      List<Token> newTokens = sentence.getTokens();

      // walk the spans right to left so that expanding one contraction does not
      // shift the indices of the spans that have not been processed yet
      for (int i = contractionsSpan.length - 1; i >= 0; i--) {

        int start = contractionsSpan[i].getStart();

        String lexeme = sentence.getTokens().get(start).getLexeme();
        String[] contractions = ContractionUtility.expand(lexeme);

        if (contractions != null) {
          // replace the contracted token with its expansion; inserting the pieces
          // in reverse order leaves them in reading order at position 'start'
          Token original = newTokens.remove(start);
          for (int j = contractions.length - 1; j >= 0; j--) {
            Token token = new TokenImpl(original.getStart(), original.getEnd(), contractions[j]);
            newTokens.add(start, token);

            // tag each piece with its position in the expansion: B(egin), I(nside), E(nd)
            String caze;
            if (j == 0) caze = "B";
            else if (j == contractions.length - 1) caze = "E";
            else caze = "I";

            token.addContext(Analyzers.CONTRACTION_FINDER, caze);
          }
        } else {
          // no expansion known for this lexeme; leave the original token in place
          LOGGER.debug("Missing contraction: " + lexeme);
        }
      }
      sentence.setTokens(newTokens);
    }
  }
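
This analyzer assumes ContractionUtility.expand maps a contracted surface form to its component lexemes and returns null for unknown forms. A minimal stand-in to illustrate that contract, using Portuguese contractions as examples; the real lookup table lives in the host project:

  // hypothetical stand-in for ContractionUtility.expand
  public static String[] expand(String lexeme) {
    switch (lexeme.toLowerCase()) {
      case "do":  return new String[] {"de", "o"};   // "do"  -> "de" + "o"
      case "na":  return new String[] {"em", "a"};   // "na"  -> "em" + "a"
      case "dos": return new String[] {"de", "os"};  // "dos" -> "de" + "os"
      default:    return null;                       // unknown contraction
    }
  }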