Ejemplo n.º 1
0
  /** @see org.apache.uima.collection.CollectionReader#getNext(org.apache.uima.cas.CAS) */
  public void getNext(CAS aCAS) throws IOException, CollectionException {
    JCas jcas;
    try {
      jcas = aCAS.getJCas();
    } catch (CASException e) {
      throw new CollectionException(e);
    }

    // open input stream to file
    File file = (File) mFiles.get(mCurrentIndex++);
    String text = FileUtils.file2String(file, mEncoding);
    // put document in CAS
    jcas.setDocumentText(text);

    // set language if it was explicitly specified as a configuration parameter
    if (mLanguage != null) {
      jcas.setDocumentLanguage(mLanguage);
    }

    // Also store location of source document in CAS. This information is critical
    // if CAS Consumers will need to know where the original document contents are located.
    // For example, the Semantic Search CAS Indexer writes this information into the
    // search index that it creates, which allows applications that use the search index to
    // locate the documents that satisfy their semantic queries.
    SourceDocumentInformation srcDocInfo = new SourceDocumentInformation(jcas);
    srcDocInfo.setUri(file.getAbsoluteFile().toURL().toString());
    srcDocInfo.setOffsetInSource(0);
    srcDocInfo.setDocumentSize((int) file.length());
    srcDocInfo.setLastSegment(mCurrentIndex == mFiles.size());
    srcDocInfo.addToIndexes();
  }
 /**
  * @param input the text to vectorize
  * @param label the label of the text
  * @return a dataset with a applyTransformToDestination of weights(relative to impl; could be word
  *     counts or tfidf scores)
  */
 @Override
 public DataSet vectorize(File input, String label) {
   try {
     String text = FileUtils.file2String(input);
     return vectorize(text, label);
   } catch (Exception e) {
     throw new RuntimeException(e);
   }
 }
  /**
   * CasConsumer would use tags and features to write output file, evaluate and print precision,
   * recall and F-1 measure.
   *
   * @param arg0
   * @throws ResourceProcessException
   */
  @Override
  public void processCas(CAS arg0) throws ResourceProcessException {
    /** convert type of arg0 */
    JCas jcas = null;
    try {
      jcas = arg0.getJCas();
    } catch (CASException e1) {
      // TODO Auto-generated catch block
      e1.printStackTrace();
    }
    // TODO Auto-generated method stub
    FSIterator<Annotation> ite = jcas.getAnnotationIndex(WordTag.type).iterator();

    while (ite.hasNext()) {
      /** collect features */
      String id = ((WordTag) ite.get()).getId();
      int begin = ((WordTag) ite.get()).getBegin0();
      int end = ((WordTag) ite.get()).getEnd0();
      String name = ((WordTag) ite.get()).getName();

      /** organize string for output */
      report.append(id);
      report.append("|");
      report.append(begin);
      report.append(" ");
      report.append(end);
      report.append("|");
      report.append(name);
      report.append("\n");

      /** count the length of output string */
      count++;
      ite.next();
    }

    result = report.toString();
    File sampleOut = new File("src/main/resources/data/sample.out");
    try {
      testRecall = FileUtils.file2String(sampleOut);
    } catch (IOException e1) {
      // TODO Auto-generated catch block
      e1.printStackTrace();
    }

    /** split strings from file into sentences */
    String[] resultSplit = result.split("\n");
    String[] recallSplit = testRecall.split("\n");
    PrecisionRecallCalculator(recallSplit, resultSplit);

    /** write the output file to the project root */
    String path = "hw1-longh.out";
    File dirFile = new File(path);

    /** make sure no conflict */
    if (dirFile.exists()) {
      dirFile.delete();
    }

    try {
      /** write file */
      BufferedWriter bw1 = new BufferedWriter(new FileWriter(path, true));
      bw1.write(report.toString());
      bw1.flush();
      bw1.close();
    } catch (IOException e) {
      e.printStackTrace();
    }
  }