/**
   * Returns the next sentence, pulling and analysing further documents from the collection reader
   * whenever the current batch of sentences is exhausted.
   *
   * <p>Documents that yield no {@code Sentence} annotation are skipped. If a pre-processor is
   * configured it is applied to the sentence before it is returned.
   *
   * @return the next (optionally pre-processed) sentence; {@code null} when the reader has no more
   *     documents; the empty string when fetching the next document from the reader fails
   * @throws RuntimeException wrapping any exception raised while analysing a document
   */
  @Override
  public synchronized String nextSentence() {
    if (sentences == null || !sentences.hasNext()) {
      try {
        if (!getReader().hasNext()) {
          return null;
        }

        CAS cas = resource.retrieve();

        try {
          getReader().getNext(cas);
        } catch (Exception e) {
          // Keep the historical "empty string" signal, but stop hiding the cause from the log.
          log.warn("Done iterating returning an empty string", e);
          return "";
        }

        List<String> list = new ArrayList<>();
        while (true) {
          resource.getAnalysisEngine().process(cas);
          for (Sentence sentence : JCasUtil.select(cas.getJCas(), Sentence.class)) {
            list.add(sentence.getCoveredText());
          }
          sentences = list.iterator();
          if (sentences.hasNext()) {
            break;
          }

          // This document produced no sentence: move on to the next one, if there is one.
          // The reader is always reached through getReader(), as everywhere else in the method.
          if (!getReader().hasNext()) {
            return null;
          }
          cas.reset();
          getReader().getNext(cas);
        }
      } catch (Exception e) {
        throw new RuntimeException(e);
      }
    }

    // Single exit for both paths, so pre-processing behaves the same whether or not a refill
    // was needed.
    String ret = sentences.next();
    if (this.getPreProcessor() != null) {
      ret = this.getPreProcessor().preProcess(ret);
    }
    return ret;
  }
  /**
   * Loads the next file of the collection into the given CAS and records where it came from.
   *
   * @param aCAS the CAS to fill with the file's text
   * @throws IOException if the file cannot be read
   * @throws CollectionException if the JCas interface of the CAS cannot be obtained
   * @see org.apache.uima.collection.CollectionReader#getNext(org.apache.uima.cas.CAS)
   */
  public void getNext(CAS aCAS) throws IOException, CollectionException {
    final JCas view;
    try {
      view = aCAS.getJCas();
    } catch (CASException casFailure) {
      throw new CollectionException(casFailure);
    }

    // Take the file at the cursor and move the cursor past it.
    final File current = (File) mFiles.get(mCurrentIndex++);
    view.setDocumentText(FileUtils.file2String(current, mEncoding));

    // The language is only recorded when one was configured.
    if (mLanguage != null) {
      view.setDocumentLanguage(mLanguage);
    }

    // Store the location of the source document as well, so that CAS consumers can find the
    // original content later (a search indexer, for instance, writes it into its index).
    final boolean lastFile = mCurrentIndex == mFiles.size();
    final SourceDocumentInformation origin = new SourceDocumentInformation(view);
    origin.setUri(current.getAbsoluteFile().toURL().toString());
    origin.setOffsetInSource(0);
    origin.setDocumentSize((int) current.length());
    origin.setLastSegment(lastFile);
    origin.addToIndexes();
  }
Esempio n. 3
0
  /**
   * Fills the CAS with the text of the next XML file polled from {@code GlobalFileStorage}.
   *
   * <p>The file is parsed with a SAX parser whose content handler collects {@code String[]}
   * entries. For every entry, {@code element[1]} is appended to the document text (entries are
   * separated by two newlines) and a {@code Section} annotation covering that text is created with
   * {@code element[0]} as its value. Document meta data (title, id, URIs) is derived from the file.
   *
   * @param aCAS the CAS to fill
   * @throws IOException declared by the interface; failures inside the method are reported as
   *     {@link CollectionException}
   * @throws CollectionException if the JCas cannot be obtained, no file is available, or parsing
   *     or CAS population fails
   */
  @Override
  public void getNext(CAS aCAS) throws IOException, CollectionException {
    JCas jcas;
    try {
      jcas = aCAS.getJCas();
    } catch (CASException e) {
      throw new CollectionException(e);
    }

    try {
      // parse the xml file
      File xmlFile = GlobalFileStorage.getInstance().poll();
      if (xmlFile == null) {
        // Previously this surfaced as an anonymous NullPointerException on the next line.
        throw new IllegalStateException("GlobalFileStorage returned no file to process");
      }

      System.out.println("Process file: " + xmlFile.getName());

      SAXParserFactory spf = SAXParserFactory.newInstance();
      SAXParser sp = spf.newSAXParser();
      XMLReader xr = sp.getXMLReader();

      LinkedList<String[]> textElements = new LinkedList<>();
      FragmentContentHandler fch = new FragmentContentHandler(xr, textElements);
      xr.setContentHandler(fch);
      // Close the file handle whether or not parsing succeeds; it used to be leaked.
      try (FileInputStream xmlStream = new FileInputStream(xmlFile)) {
        xr.parse(new InputSource(xmlStream));
      }

      StringBuilder docText = new StringBuilder();

      for (String[] element : textElements) {

        // The section spans exactly the element text, not the separator that follows it.
        int start = docText.length();
        int end = start + element[1].length();

        docText.append(element[1]).append("\n\n");

        Section section = new Section(jcas, start, end);
        section.setValue(element[0]);
        section.addToIndexes();
      }

      jcas.setDocumentText(docText.toString().trim());
      jcas.setDocumentLanguage(language);

      DocumentMetaData docMetaData = DocumentMetaData.create(aCAS);
      docMetaData.setDocumentTitle(xmlFile.getName());
      docMetaData.setDocumentId(xmlFile.getAbsolutePath());
      docMetaData.setDocumentBaseUri("file:" + xmlFile.getParentFile().getAbsolutePath());
      docMetaData.setDocumentUri("file:" + xmlFile.getAbsolutePath());

    } catch (Exception e) {
      // Every failure, whatever its type, reaches the caller as a CollectionException.
      throw new CollectionException(e);
    }
  }
  /**
   * Reads every serialized CAS ({@code *.bin}) found in the given folder.
   *
   * <p>A fresh CAS is created for each document; its text is echoed to standard output.
   *
   * @param path the folder to read from
   * @return the JCas of every CAS that was read, in the order the reader delivered them
   * @throws Exception if the reader cannot be created or a CAS cannot be created or filled
   */
  public List<JCas> read(String path) throws Exception {
    System.out.println("--- READING ---");

    final String[] binaryFilesOnly = {ResourceCollectionReaderBase.INCLUDE_PREFIX + "*.bin"};
    @SuppressWarnings("deprecation")
    CollectionReader casReader =
        CollectionReaderFactory.createReader(
            BinaryCasReader.class,
            ResourceCollectionReaderBase.PARAM_PATH,
            path,
            ResourceCollectionReaderBase.PARAM_PATTERNS,
            binaryFilesOnly);

    List<JCas> loaded = new ArrayList<JCas>();
    while (casReader.hasNext()) {
      CAS target = CasCreationUtils.createCas(createTypeSystemDescription(), null, null);
      casReader.getNext(target);

      JCas view = target.getJCas();
      System.out.println(view.getDocumentText());
      loaded.add(view);
    }
    return loaded;
  }
  /**
   * Records the first {@code Document} annotation of the CAS in the query or answer dictionaries.
   *
   * <p>A map from token text to token frequency is built from the document's token list. When the
   * relevance value is 99 the document is treated as a question: its query id and relevance value
   * go to {@code QuesqIdList} / {@code QuesrelList} and the map to {@code queryDictionary}.
   * Otherwise it is treated as an answer: {@code AnsqIdList}, {@code AnsrelList} and
   * {@code answerDictionary} are filled, and an answer with relevance 1 also has its text stored
   * in {@code GoldAnswerStringList} under its query id.
   *
   * <p>A CAS without a {@code Document} annotation is ignored.
   *
   * @param aCas the CAS to inspect
   * @throws ResourceProcessException if the JCas interface of the CAS cannot be obtained
   */
  @Override
  public void processCas(CAS aCas) throws ResourceProcessException {
    final JCas view;
    try {
      view = aCas.getJCas();
    } catch (CASException casFailure) {
      throw new ResourceProcessException(casFailure);
    }

    FSIterator documents = view.getAnnotationIndex(Document.type).iterator();
    if (!documents.hasNext()) {
      return;
    }
    Document doc = (Document) documents.next();

    // Earlier annotators are expected to have populated the token list in the CAS.
    FSList fsTokens = doc.getTokenList();
    ArrayList<Token> tokens = Utils.fromFSListToCollection(fsTokens, Token.class);

    HashMap<String, Integer> frequencies = new HashMap<String, Integer>();

    if (doc.getRelevanceValue() == 99) {
      // Question: fill QuesqIdList, QuesrelList and queryDictionary.
      QuesqIdList.add(doc.getQueryID());
      QuesrelList.add(doc.getRelevanceValue());
      for (Token token : tokens) {
        frequencies.put(token.getText(), token.getFrequency());
      }
      queryDictionary.add(frequencies);
      return;
    }

    // Answer: fill AnsqIdList, AnsrelList and answerDictionary.
    AnsqIdList.add(doc.getQueryID());
    AnsrelList.add(doc.getRelevanceValue());
    for (Token token : tokens) {
      frequencies.put(token.getText(), token.getFrequency());
    }
    answerDictionary.add(frequencies);
    if (1 == doc.getRelevanceValue()) {
      GoldAnswerStringList.put(doc.getQueryID(), doc.getText());
    }
  }
  /**
   * Populates the given CAS through {@code fillJCas} and prints a progress dot to standard error.
   *
   * @param aCAS the CAS to populate
   * @throws IOException declared by the interface
   * @throws CollectionException if the JCas interface of the CAS cannot be obtained
   */
  public void getNext(CAS aCAS) throws IOException, CollectionException {
    final JCas target;
    try {
      target = aCAS.getJCas();
    } catch (CASException casFailure) {
      throw new CollectionException(casFailure);
    }

    fillJCas(target);

    // One dot per call serves as a lightweight "a file has been processed" indicator.
    System.err.print(".");
  }
  /**
   * Uses the sentence at the current index as the document text of the CAS and advances the index.
   *
   * @param aCAS the CAS to fill
   * @throws IOException declared by the interface
   * @throws CollectionException if the JCas interface of the CAS cannot be obtained
   */
  @Override
  public void getNext(CAS aCAS) throws IOException, CollectionException {
    final JCas view;
    try {
      view = aCAS.getJCas();
    } catch (CASException casFailure) {
      throw new CollectionException(casFailure);
    }

    // The post-increment moves the cursor on after the current sentence has been picked.
    view.setDocumentText(mSentences.get(mCurrentIndex++));
  }
Esempio n. 8
0
  /**
   * Gets the next sentence from the input file.
   *
   * <p>Each line is expected to look like {@code <id> <text>}. The part before the first blank
   * becomes the sentence id; the remainder, starting at that blank, becomes the document text.
   *
   * @param aCas the CAS to fill
   * @throws IOException if the line cannot be read, the end of the input has been reached, or the
   *     line contains no blank separating id and text
   * @throws CollectionException if the JCas interface of the CAS cannot be obtained
   * @see org.apache.uima.collection.CollectionReader#getNext(org.apache.uima.cas.CAS)
   */
  @Override
  public void getNext(CAS aCas) throws IOException, CollectionException {
    JCas jcas = null;
    try {
      jcas = aCas.getJCas();
    } catch (CASException e) {
      throw new CollectionException(e);
    }

    String rawLine = mBufferdReader.readLine();
    if (rawLine == null) {
      // readLine() signals end of input with null; this used to end in a NullPointerException.
      throw new IOException("No more sentences: end of input reached");
    }

    String lineString = rawLine.trim();
    int separator = lineString.indexOf(" ");
    if (separator < 0) {
      // Without a separator substring(0, -1) would throw StringIndexOutOfBoundsException.
      throw new IOException("Malformed input line, expected \"<id> <text>\": " + lineString);
    }

    String sentenceId = lineString.substring(0, separator);
    // The text still begins with the separating blank, exactly as before, so character offsets
    // computed on the document text do not shift.
    String sentenceText = lineString.substring(separator);

    jcas.setDocumentText(sentenceText);
    Sentence sentence = new Sentence(jcas);
    sentence.setSentenceId(sentenceId);
    sentence.addToIndexes();
  }
  /**
   * Lets the superclass fill the CAS, then marks the whole document as one sentence and attaches
   * its classification outcome.
   *
   * @param aCAS the CAS to fill
   * @throws IOException if the superclass fails to read the document
   * @throws CollectionException if the JCas interface of the CAS cannot be obtained
   */
  @Override
  public void getNext(CAS aCAS) throws IOException, CollectionException {
    super.getNext(aCAS);

    JCas jcas;
    try {
      jcas = aCAS.getJCas();
      // consider a tweet to be a sentence
      Sentence sentenceAnno = new Sentence(jcas);
      sentenceAnno.setBegin(0);
      sentenceAnno.setEnd(jcas.getDocumentText().length());
      sentenceAnno.addToIndexes();
    } catch (CASException e) {
      // Pass the cause on; the no-arg constructor used before threw it away.
      throw new CollectionException(e);
    }

    TextClassificationOutcome outcome = new TextClassificationOutcome(jcas);
    outcome.setOutcome(getTextClassificationOutcome(jcas));
    outcome.addToIndexes();
  }
Esempio n. 10
0
  /**
   * Serializes a fully analysed CAS as XMI into the output directory.
   *
   * <p>The output name is taken from the {@code SourceDocumentInformation} annotation when one is
   * present and its URI is valid: the input file name, followed by {@code _<offset>} for a
   * positive offset in source, followed by {@code .xmi}. In that case an {@code .ecore} model
   * file name is derived as well. Otherwise the file is called {@code doc<N>.xmi}, with a running
   * document counter, and no model file name is passed on.
   *
   * @param aCAS a CAS which has been populated by the TAEs
   * @throws ResourceProcessException if the JCas cannot be obtained or writing the XMI fails
   * @see org.apache.uima.collection.base_cpm.CasObjectProcessor#processCas(org.apache.uima.cas.CAS)
   */
  public void processCas(CAS aCAS) throws ResourceProcessException {
    final JCas view;
    try {
      view = aCAS.getJCas();
    } catch (CASException casFailure) {
      throw new ResourceProcessException(casFailure);
    }

    File target = null;
    String modelFileName = null;

    // Try to name the output after the input file recorded in the CAS.
    FSIterator sources = view.getAnnotationIndex(SourceDocumentInformation.type).iterator();
    if (sources.hasNext()) {
      SourceDocumentInformation origin = (SourceDocumentInformation) sources.next();
      try {
        File inFile = new File(new URL(origin.getUri()).getPath());
        String offsetSuffix =
            origin.getOffsetInSource() > 0 ? "_" + origin.getOffsetInSource() : "";
        target = new File(mOutputDir, inFile.getName() + offsetSuffix + ".xmi");
        modelFileName = mOutputDir.getAbsolutePath() + "/" + inFile.getName() + ".ecore";
      } catch (MalformedURLException e1) {
        // invalid URL, use default processing below
      }
    }

    // Fallback: a generated, numbered file name.
    if (target == null) {
      target = new File(mOutputDir, "doc" + mDocNum++ + ".xmi");
    }

    try {
      writeXmi(view.getCas(), target, modelFileName);
    } catch (IOException | SAXException writeFailure) {
      throw new ResourceProcessException(writeFailure);
    }
  }
Esempio n. 11
0
  /**
   * Reads the next file as UTF-8 text into the CAS and stores an id derived from its path as the
   * document's secondary id.
   *
   * <p>If the path cannot be cleaned up into an id, the stack trace is printed and the raw
   * absolute path is used instead.
   *
   * @param aCAS the CAS to fill
   * @throws IOException if the file cannot be read
   * @throws CollectionException if the JCas interface of the CAS cannot be obtained
   * @see com.ibm.uima.collection.CollectionReader#getNext(com.ibm.uima.cas.CAS)
   */
  public void getNext(CAS aCAS) throws IOException, CollectionException {
    final JCas view;
    try {
      view = aCAS.getJCas();
    } catch (CASException casFailure) {
      throw new CollectionException(casFailure);
    }

    final String path = fileIterator.next().getAbsolutePath();
    view.setDocumentText(ReadWriteTextFileWithEncoding.read(path, "UTF-8"));
    numberOfFilesProcessed++;

    // Falls back to the untouched path when the translator rejects it.
    String documentId = path;
    try {
      documentId = filenameToIDTranslator.cleanItUp(path);
    } catch (StringCleanerException e) {
      e.printStackTrace();
    }

    StringArray secondaryIds = new StringArray(view, 1);
    secondaryIds.set(0, filenameToIDTranslator.getIdType() + documentId);
    ISI_UIMA_Util.setDocumentSecondaryIDs(view, secondaryIds);
  }
Esempio n. 12
0
	/**
	 * Callback invoked when processing of one CAS has finished.
	 *
	 * <p>Does nothing without a status. Otherwise prints the stack trace of every reported
	 * exception to standard error and then prints each token of the CAS together with its
	 * part-of-speech value to standard output.
	 *
	 * @param aCas the processed CAS
	 * @param aStatus the processing status; may be {@code null}
	 */
	public void entityProcessComplete(CAS aCas, EntityProcessStatus aStatus) {
		if (aStatus == null) {
			return;
		}

		if (aStatus.isException()) {
			System.err.println("Error on process CAS call to remote service:");
			List<Exception> failures = aStatus.getExceptions();
			int index = 0;
			while (index < failures.size()) {
				((Throwable) failures.get(index)).printStackTrace();
				index++;
			}
		}

		try {
			JCas view = aCas.getJCas();
			for (Token token : JCasUtil.select(view, Token.class)) {
				String pos = token.getPos().getPosValue();
				System.out.println(token.getCoveredText() + " " + pos);
			}
		} catch (CASException casFailure) {
			casFailure.printStackTrace();
		}
	}
  /**
   * Copies the current streamed document into the CAS together with its source information.
   *
   * <p>The cumulated length is increased by the document's text length and a progress line is
   * logged before the CAS is touched. Corpus size and number of documents are set to -1 and the
   * last-segment flag to {@code false}, since none of these can be known while streaming. The
   * document index is incremented once the source information has been indexed.
   *
   * @param cas the CAS to fill
   * @throws IOException declared by the interface
   * @throws CollectionException if the JCas interface of the CAS cannot be obtained
   */
  @Override
  public void getNext(CAS cas) throws IOException, CollectionException {
    final String text = currentDoc.getText();
    final int textLength = text.length();

    this.cumulatedLength += textLength;
    logger.info(
        "[Stream {}] Processing document {}: {} (total length processed: {})",
        this.streamName,
        this.mCurrentIndex,
        this.currentDoc.getUri(),
        this.cumulatedLength);

    try {
      SourceDocumentInformation info = new SourceDocumentInformation(cas.getJCas());
      info.setUri(currentDoc.getUri());
      cas.setDocumentLanguage(mLanguage.getCode());
      cas.setDocumentText(text);
      info.setDocumentSize(textLength);
      info.setCumulatedDocumentSize(this.cumulatedLength);
      info.setBegin(0);
      info.setEnd(textLength);
      info.setOffsetInSource(0);
      info.setDocumentIndex(mCurrentIndex);

      // Unknown when streaming: totals of the corpus.
      info.setCorpusSize(-1);
      info.setNbDocuments(-1);

      // Unknown when streaming: whether this document is the last one.
      info.setLastSegment(false);

      info.addToIndexes();
      this.mCurrentIndex++;
    } catch (CASException casFailure) {
      throw new CollectionException(casFailure);
    }
  }
Esempio n. 14
0
 /**
  * Writes one line per {@code SemanticAnnotation} of the CAS to the output stream.
  *
  * <p>Each line holds namespace, identifier, offset and covered text, separated by the field
  * separator and terminated by {@code LINEBREAK}. When {@code replaceNewlines} is set, line
  * breaks inside the covered text are replaced by single blanks. The stream is set up before the
  * first annotation and released after the last one.
  *
  * @param cas the CAS whose annotations are written
  * @throws AnalysisEngineProcessException if the JCas cannot be obtained or any stream operation
  *     fails
  */
 @Override
 public void process(CAS cas) throws AnalysisEngineProcessException {
   JCas textJCas;
   try {
     textJCas = cas.getJCas();
     setStream(textJCas);
   } catch (CASException casFailure) {
     throw new AnalysisEngineProcessException(casFailure);
   } catch (IOException streamFailure) {
     throw new AnalysisEngineProcessException(streamFailure);
   }

   for (FSIterator<Annotation> cursor = SemanticAnnotation.getIterator(textJCas);
       cursor.hasNext(); ) {
     SemanticAnnotation ann = (SemanticAnnotation) cursor.next();

     String text = ann.getCoveredText();
     if (replaceNewlines) {
       // Flatten multi-line text so that every annotation stays on a single output line.
       text = StringUtils.join(' ', text.split(LINEBREAK));
     }

     try {
       write(ann.getNamespace());
       write(fieldSeparator);
       write(ann.getIdentifier());
       write(fieldSeparator);
       write(ann.getOffset().toString());
       write(fieldSeparator);
       write(text);
       write(LINEBREAK);
     } catch (IOException writeFailure) {
       throw new AnalysisEngineProcessException(writeFailure);
     }
   }

   try {
     unsetStream();
   } catch (IOException closeFailure) {
     throw new AnalysisEngineProcessException(closeFailure);
   }
 }
  /**
   * Appends every {@code WordTag} of the CAS to the running report, evaluates the report against
   * the sample output and rewrites the output file.
   *
   * <p>Each tag contributes one line of the form {@code id|begin end|name}. The accumulated
   * report is compared line by line with {@code src/main/resources/data/sample.out} through
   * {@code PrecisionRecallCalculator} and then written to {@code hw1-longh.out} in the working
   * directory, replacing any previous file.
   *
   * @param arg0 the CAS holding the {@code WordTag} annotations
   * @throws ResourceProcessException if the JCas interface of the CAS cannot be obtained
   */
  @Override
  public void processCas(CAS arg0) throws ResourceProcessException {
    // convert type of arg0
    JCas jcas;
    try {
      jcas = arg0.getJCas();
    } catch (CASException e1) {
      // Nothing below can work without a JCas; the swallowed exception used to turn into a
      // NullPointerException on the very next statement.
      throw new ResourceProcessException(e1);
    }

    FSIterator<Annotation> ite = jcas.getAnnotationIndex(WordTag.type).iterator();

    while (ite.hasNext()) {
      // collect features
      WordTag tag = (WordTag) ite.next();
      String id = tag.getId();
      int begin = tag.getBegin0();
      int end = tag.getEnd0();
      String name = tag.getName();

      // organize string for output
      report.append(id);
      report.append("|");
      report.append(begin);
      report.append(" ");
      report.append(end);
      report.append("|");
      report.append(name);
      report.append("\n");

      // count the number of output lines
      count++;
    }

    result = report.toString();
    File sampleOut = new File("src/main/resources/data/sample.out");
    try {
      testRecall = FileUtils.file2String(sampleOut);
    } catch (IOException e1) {
      // Best effort, as before: testRecall keeps whatever value it had.
      e1.printStackTrace();
    }

    // split strings from file into sentences
    String[] resultSplit = result.split("\n");
    String[] recallSplit = testRecall.split("\n");
    PrecisionRecallCalculator(recallSplit, resultSplit);

    // write the output file to the project root
    String path = "hw1-longh.out";
    File dirFile = new File(path);

    // make sure no conflict
    if (dirFile.exists()) {
      dirFile.delete();
    }

    // try-with-resources closes (and thereby flushes) the writer even when write() fails.
    try (BufferedWriter bw1 = new BufferedWriter(new FileWriter(path, true))) {
      bw1.write(report.toString());
    } catch (IOException e) {
      e.printStackTrace();
    }
  }
  /**
   * Scores the entity labels and the polarity found in the CAS against the next tweet of the
   * evaluation corpus and updates the running counters.
   *
   * <p>Per token, {@code stat[label][0]} counts classified labels, {@code stat[label][1]} counts
   * gold labels and {@code stat[label][2]} counts exact matches. {@code classifiedEntityCnt} and
   * {@code answerEntityCnt} are incremented whenever a new entity starts, where
   * {@code labelIndex / 3} serves as the entity bucket (presumably three labels per entity type;
   * confirm against the label map). The {@code senti} matrix is updated in the same way for the
   * tweet polarity. A readable summary of the tweet and its classified entities is logged at
   * INFO level.
   *
   * @param aCAS the CAS holding the {@code TweetAnnotation} and its {@code TokenAnnotation}s
   * @throws ResourceProcessException if the JCas cannot be obtained, the tweet id does not match
   *     the evaluation corpus, or the CAS raises a runtime exception
   * @see org.apache.uima.collection.base_cpm.CasObjectProcessor#processCas(org.apache.uima.cas.CAS)
   */
  public void processCas(CAS aCAS) throws ResourceProcessException {
    JCas jcas;
    try {
      jcas = aCAS.getJCas();
    } catch (CASException e) {
      logger.log(Level.SEVERE, e.getMessage());
      throw new ResourceProcessException(e);
    }

    TweetAnnotation tweetAnn =
        (TweetAnnotation) jcas.getAnnotationIndex(TweetAnnotation.type).iterator().next();
    OMTweet answerTweet = evalCorpusReader.next();

    // Both corpora are read in lock step, so the ids have to agree.
    if (!answerTweet.getId().equals(tweetAnn.getId())) {
      logger.log(
          Level.SEVERE,
          "target corpus and evaluation corpus don't match to each other - "
              + answerTweet.getId()
              + ", "
              + tweetAnn.getId());
      throw new ResourceProcessException();
    }

    String[] entity = extractEntityTags(answerTweet.getText());

    String classified = null;
    String prevClassified = null;
    // Method-local buffer: no synchronisation needed, hence StringBuilder.
    StringBuilder sb = new StringBuilder();
    try {
      sb.append("\n[");
      sb.append(answerTweet.getPolarityString());
      sb.append("=>");
      sb.append(tweetAnn.getPolarity());
      sb.append("] ");
      sb.append(tweetAnn.getCoveredText());
      sb.append('\n');

      FSIterator<Annotation> tokenAnnIter =
          jcas.getAnnotationIndex(TokenAnnotation.type).iterator();
      TokenAnnotation tokenAnn = null;

      int i = 0;
      int prevClassifiedIdx = labelNoneIdx;
      int prevAnswerIdx = labelNoneIdx;
      String classifiedEntityStr = "";

      while (tokenAnnIter.hasNext()) {
        tokenAnn = (TokenAnnotation) tokenAnnIter.next();

        classified = tokenAnn.getEntityLabel();
        String answer = entity[i];
        boolean correct = classified.equals(answer);

        int classifiedIdx = 0;
        int answerIdx = 0;
        // An unknown label makes the lookup fail; it is then logged and counted as "none".
        try {
          answerIdx = map.get(answer);
        } catch (Exception e) {
          logger.log(
              Level.SEVERE,
              "wrong annotation on the evaluation corpus - tweet id: "
                  + answerTweet.getId()
                  + ", answerTag="
                  + answer);
          logger.log(Level.SEVERE, e.getMessage());
          answerIdx = map.get(labelNone);
        }
        try {
          classifiedIdx = map.get(classified);
        } catch (Exception e) {
          logger.log(
              Level.SEVERE,
              "wrong annotation from the NER - tweet id: "
                  + answerTweet.getId()
                  + ", classifiedTag="
                  + classified);
          logger.log(Level.SEVERE, e.getMessage());
          classifiedIdx = map.get(labelNone);
        }

        stat[classifiedIdx][0]++;
        stat[answerIdx][1]++;

        if (correct) {
          stat[classifiedIdx][2]++;
        }

        if (classifiedIdx != labelNoneIdx) {
          if (classifiedIdx / 3 != prevClassifiedIdx / 3) {
            // A new entity starts here; flush the previous one to the summary first.
            classifiedEntityCnt[classifiedIdx / 3]++;
            if (prevClassifiedIdx != labelNoneIdx) {
              sb.append('\t');
              sb.append(classifiedEntityStr);
              sb.append(" -> ");
              sb.append(prevClassified.substring(0, prevClassified.lastIndexOf('_')));
              sb.append('\n');
            }
            classifiedEntityStr = tokenAnn.getCoveredText();
          } else {
            classifiedEntityStr += " " + tokenAnn.getCoveredText();
          }
        } else if (prevClassifiedIdx != labelNoneIdx) {
          // The entity ended on the previous token.
          sb.append('\t');
          sb.append(classifiedEntityStr);
          sb.append(" -> ");
          sb.append(prevClassified.substring(0, prevClassified.lastIndexOf('_')));
          sb.append('\n');
          classifiedEntityStr = "";
        }
        prevClassifiedIdx = classifiedIdx;

        // Only the count of gold entities is kept; the gold entity text was assembled but never
        // read, so that write-only string has been dropped.
        if (answerIdx != labelNoneIdx && answerIdx / 3 != prevAnswerIdx / 3) {
          answerEntityCnt[answerIdx / 3]++;
        }

        prevAnswerIdx = answerIdx;
        prevClassified = classified;
        i++;
      }
      if (prevClassifiedIdx != labelNoneIdx) {
        // Flush an entity that runs up to the last token.
        sb.append('\t');
        sb.append(classifiedEntityStr);
        sb.append(" -> ");
        sb.append(prevClassified.substring(0, prevClassified.lastIndexOf('_')));
        sb.append('\n');
      }

      // senti
      String answerSenti = answerTweet.getPolarityString();
      String classifiedSenti = tweetAnn.getPolarity();

      // Fixed: this used to compare the polarity string with the `senti` counter array, which can
      // never be equal. The gold polarity is what was meant.
      boolean correct = classifiedSenti.equals(answerSenti);

      int classifiedIdx = sentiIdx(classifiedSenti);
      int answerIdx = sentiIdx(answerSenti);

      senti[classifiedIdx][0]++;
      senti[answerIdx][1]++;
      if (classifiedIdx == answerIdx) {
        correct = true;
      }

      if (correct) {
        senti[classifiedIdx][2]++;
      }
      cnt++;

      logger.log(Level.INFO, sb.toString());

    } catch (CASRuntimeException e) {
      throw new ResourceProcessException(e);
    }
  }
  /**
   * Copies the pre-read text, with its gold paragraphs, sentences and grammar errors, into the
   * CAS and then reads the following text ahead.
   *
   * <p>Source document information is not recorded by this reader.
   *
   * @param aCAS the CAS to fill
   * @throws IOException if reading the following text fails
   * @throws CollectionException if the JCas interface of the CAS cannot be obtained
   * @see org.apache.uima.collection.CollectionReader#getNext(org.apache.uima.cas.CAS)
   */
  public void getNext(CAS aCAS) throws IOException, CollectionException {
    final JCas view;
    try {
      view = aCAS.getJCas();
    } catch (CASException casFailure) {
      throw new CollectionException(casFailure);
    }

    view.setDocumentText(mNextText.getText());

    for (Paragraph para : mNextText.getParagraphs()) {
      GoldenParagraph goldPara = new GoldenParagraph(view);

      for (SentenceEx sent : para.getSentences()) {
        GoldenSentence goldSent = new GoldenSentence(view);
        goldSent.setId(sent.getId());
        goldSent.setBegin(sent.getStart());
        goldSent.setEnd(sent.getEnd());

        // The error array is only attached when the sentence has at least one error.
        List<GrEr> errors = sent.getGrammarErrors();
        if (!errors.isEmpty()) {
          FSArray errorArray = new FSArray(view, errors.size());
          int slot = 0;
          for (GrEr error : errors) {
            GoldenGrammarError goldError = new GoldenGrammarError(view);
            goldError.setBegin(error.getStart());
            goldError.setEnd(error.getEnd());
            goldError.setCategory(error.getCat());
            goldError.setError(error.getErr());
            goldError.setReplace(error.getRep());
            goldError.addToIndexes();
            errorArray.set(slot++, goldError);
          }
          goldSent.setGoldenGrammarErrors(errorArray);
        }

        goldSent.addToIndexes();
      }

      goldPara.setId(para.getId());
      goldPara.setBegin(para.getStart());
      goldPara.setEnd(para.getEnd());
      goldPara.addToIndexes();
    }

    // The language is only set when it was explicitly configured.
    if (mLanguage != null) {
      ((DocumentAnnotation) view.getDocumentAnnotationFs()).setLanguage(mLanguage);
    }

    // Advance: count this text and fetch the one the next call will deliver.
    mCurrentText++;
    mNextText = mMultiReader.read();
  }
Esempio n. 18
0
  /**
   * Reads the next gzip-compressed file into the CAS, one {@code PubmedDocument} per line.
   *
   * <p>Every line is split by {@code splitFirst} at the first tab into a PMID and a text. The
   * texts are concatenated, each followed by a newline, to form the document text; each
   * {@code PubmedDocument} annotation spans its own text without the trailing newline. A
   * {@code SourceDocumentInformation} annotation describing the file is added at the end.
   *
   * <p>The file is decoded with the platform default charset, as before.
   *
   * @param aCAS the CAS to fill
   * @throws IOException if the file cannot be opened, decompressed or read
   * @throws CollectionException if the JCas interface of the CAS cannot be obtained
   * @see org.apache.uima.collection.CollectionReader#getNext(org.apache.uima.cas.CAS)
   */
  public void getNext(CAS aCAS) throws IOException, CollectionException {
    JCas jcas;
    try {
      jcas = aCAS.getJCas();
    } catch (CASException e) {
      throw new CollectionException(e);
    }

    // pick the file at the cursor and advance the cursor
    File file = (File) mFiles.get(mCurrentIndex++);

    System.out.println("Reading file: " + file.getAbsolutePath());

    StringBuilder textBuffer = new StringBuilder();
    // Offset of the last character written so far; -1 while nothing has been written.
    int currindex = -1;

    // try-with-resources closes the whole stream chain, which used to be leaked. The file stream
    // is declared on its own so it is closed even if the gzip header cannot be read.
    try (FileInputStream fileIn = new FileInputStream(file);
        BufferedReader reader =
            new BufferedReader(new InputStreamReader(new GZIPInputStream(fileIn)))) {
      String line;
      // readLine() == null is the reliable end-of-stream test; ready() may still report true when
      // no further line exists, which made the old loop pass null to splitFirst.
      while ((line = reader.readLine()) != null) {
        // split line into pmid and text
        String[] two = splitFirst(line, "\t");
        String annot = two[1];

        PubmedDocument pmdoc = new PubmedDocument(jcas);
        pmdoc.setPmid(two[0]);

        // append text
        textBuffer.append(annot).append('\n');
        pmdoc.setBegin(currindex + 1);
        currindex += annot.length() + 1;
        pmdoc.setEnd(currindex);
        pmdoc.addToIndexes();
      }
    }

    // put document in CAS
    jcas.setDocumentText(textBuffer.toString());

    // set language if it was explicitly specified as a configuration parameter
    if (mLanguage != null) {
      ((DocumentAnnotation) jcas.getDocumentAnnotationFs()).setLanguage(mLanguage);
    }

    // Also store location of source document in CAS. This information is critical
    // if CAS Consumers will need to know where the original document contents are located.
    SourceDocumentInformation srcDocInfo = new SourceDocumentInformation(jcas);
    srcDocInfo.setUri(file.getAbsoluteFile().toURI().toString());
    srcDocInfo.setOffsetInSource(0);
    srcDocInfo.setDocumentSize((int) file.length());
    srcDocInfo.setLastSegment(mCurrentIndex == mFiles.size());
    srcDocInfo.setBegin(0);
    // An empty file leaves currindex at -1; clamp so the annotation never ends before it begins.
    srcDocInfo.setEnd(Math.max(currindex, 0));
    srcDocInfo.addToIndexes();
  }