Пример #1
0
  /**
   * Creates one word individual in the model for every word of a sentence and links it to the
   * given sentence and context resources.
   *
   * <p>The URI of each word individual is the context URI up to and including its last
   * {@code '='}, followed by {@code start,end} of the word. The created individual is also stored
   * on the word via {@code setResource}.
   *
   * @param outputModel model that receives the word individuals and their properties
   * @param wordObjectsOfSentence words of the sentence; each must already carry its start and end
   * @param sentenceResource sentence individual that the words are attached to
   * @param contextResource context individual referenced by the words and by the sentence
   * @return the same list instance that was passed in as {@code wordObjectsOfSentence}
   */
  private List<ConLLWord> addWordResourcesToModel(
      OntModel outputModel,
      List<ConLLWord> wordObjectsOfSentence,
      Individual sentenceResource,
      Individual contextResource) {
    for (ConLLWord token : wordObjectsOfSentence) {
      // Build the word URI: context URI prefix (through the last '=') plus "start,end".
      String contextUri = contextResource.getURI();
      String uriPrefix = contextUri.substring(0, contextUri.lastIndexOf("=") + 1);
      String tokenUri = uriPrefix + token.getStart() + "," + token.getEnd();

      // The individual is typed as RFC5147String, Word and String.
      Individual tokenIndividual =
          outputModel.createIndividual(
              tokenUri, outputModel.createClass(NIFOntClasses.RFC5147String.getUri()));
      tokenIndividual.addOntClass(NIFOntClasses.Word.getOntClass(outputModel));
      tokenIndividual.addOntClass(NIFOntClasses.String.getOntClass(outputModel));

      // Character span and surface form of the word.
      String beginText = token.getStart() + "";
      tokenIndividual.addProperty(
          NIFDatatypeProperties.beginIndex.getDatatypeProperty(outputModel), beginText);
      String endText = token.getEnd() + "";
      tokenIndividual.addProperty(
          NIFDatatypeProperties.endIndex.getDatatypeProperty(outputModel), endText);
      tokenIndividual.addLiteral(
          NIFDatatypeProperties.anchorOf.getDatatypeProperty(outputModel),
          outputModel.createLiteral(token.getWordString()));

      // Coarse part-of-speech tag, plus the OLiA annotation when the tagset is known.
      tokenIndividual.addProperty(
          NIFDatatypeProperties.posTag.getDatatypeProperty(outputModel), token.getPos());
      if (tagsetKnown) {
        addOliaPos(tokenIndividual, token.getPos(), outputModel);
      }

      tokenIndividual.addProperty(
          NIFObjectProperties.referenceContext.getObjectProperty(outputModel), contextResource);

      // "_" marks an absent column value; such values are not written to the model.
      // TODO: we may need something different here
      boolean hasFinePos = !token.getPosFine().equals("_");
      if (hasFinePos) {
        tokenIndividual.addProperty(
            NIFDatatypeProperties.posTag.getDatatypeProperty(outputModel), token.getPosFine());
      }
      boolean hasLemma = !token.getLemma().equals("_");
      if (hasLemma) {
        tokenIndividual.addProperty(
            NIFDatatypeProperties.lemma.getDatatypeProperty(outputModel), token.getLemma());
      }
      // TODO: add genus, numerus etc here

      // Link word -> sentence, remember the individual on the word, then link sentence -> word.
      tokenIndividual.addProperty(
          NIFObjectProperties.sentence.getObjectProperty(outputModel), sentenceResource);
      token.setResource(tokenIndividual);
      sentenceResource.addProperty(
          NIFObjectProperties.word.getObjectProperty(outputModel), tokenIndividual);
      // Added once per word, exactly as before; an empty word list adds nothing.
      sentenceResource.addProperty(
          NIFObjectProperties.referenceContext.getObjectProperty(outputModel), contextResource);
    }
    return wordObjectsOfSentence;
  }
Пример #2
0
  /**
   * Parses the lines of one sentence into {@link ConLLWord} objects and assigns each word its
   * start and end offset within the reconstructed sentence text.
   *
   * <p>The sentence text is rebuilt while iterating: a regular word is followed by a single
   * space, a space directly in front of closing punctuation is removed again, and nothing is
   * appended after opening or closing punctuation. As a side effect the trimmed sentence text is
   * stored in {@code this.tempSentence}.
   *
   * @param sentence list of line strings from the ConLL file, one line per word
   * @param startOffset offset used as the start of the first word; all later offsets build on it
   * @return the parsed words in input order, each with its start and end offset set
   */
  private List<ConLLWord> transformSentenceToObjects(List<String> sentence, int startOffset) {
    // Compile both patterns once per call; String.matches would recompile them for every word.
    // TODO: check if word is punctuation class word instead of this static check
    final java.util.regex.Pattern closingPunctuation =
        java.util.regex.Pattern.compile("([\\.!?,;:)]|'+)");
    final java.util.regex.Pattern openingPunctuation =
        java.util.regex.Pattern.compile("([(]|`+)");

    // A StringBuilder avoids copying the whole sentence again for every appended word.
    StringBuilder sentenceText = new StringBuilder();
    List<ConLLWord> sentenceObjects = new ArrayList<ConLLWord>(sentence.size());
    int offset = startOffset;

    for (String line : sentence) {
      ConLLWord word = getWordFromCorpusLine(line);
      String token = word.getWordString();
      boolean followedBySpace = false;

      if (closingPunctuation.matcher(token).matches()) {
        // Remove the whitespace in front of the punctuation mark and take it back out of the
        // running offset.
        int lastIndex = sentenceText.length() - 1;
        if (lastIndex >= 0 && sentenceText.charAt(lastIndex) == ' ') {
          sentenceText.setLength(lastIndex);
          offset--;
        }
        // NOTE(review): no space is appended after closing punctuation, so a following regular
        // word is glued to it (e.g. "a,b") - confirm this is intended.
        sentenceText.append(token);
      } else if (openingPunctuation.matcher(token).matches()) {
        // No space after an opening bracket or opening quote.
        sentenceText.append(token);
      } else {
        // Regular word: add a space after it and account for that space in the offset.
        sentenceText.append(token).append(' ');
        followedBySpace = true;
      }

      word.setStart(offset);
      int wordEnd = offset + token.length();
      word.setEnd(wordEnd);
      offset = followedBySpace ? wordEnd + 1 : wordEnd;

      sentenceObjects.add(word);
    }
    this.tempSentence = sentenceText.toString().trim();

    return sentenceObjects;
  }