/** @see org.apache.uima.collection.CollectionReader#getNext(org.apache.uima.cas.CAS) */ public void getNext(CAS aCAS) throws IOException, CollectionException { JCas jcas; try { jcas = aCAS.getJCas(); } catch (CASException e) { throw new CollectionException(e); } // open input stream to file File file = (File) mFiles.get(mCurrentIndex++); String text = FileUtils.file2String(file, mEncoding); // put document in CAS jcas.setDocumentText(text); // set language if it was explicitly specified as a configuration parameter if (mLanguage != null) { jcas.setDocumentLanguage(mLanguage); } // Also store location of source document in CAS. This information is critical // if CAS Consumers will need to know where the original document contents are located. // For example, the Semantic Search CAS Indexer writes this information into the // search index that it creates, which allows applications that use the search index to // locate the documents that satisfy their semantic queries. SourceDocumentInformation srcDocInfo = new SourceDocumentInformation(jcas); srcDocInfo.setUri(file.getAbsoluteFile().toURL().toString()); srcDocInfo.setOffsetInSource(0); srcDocInfo.setDocumentSize((int) file.length()); srcDocInfo.setLastSegment(mCurrentIndex == mFiles.size()); srcDocInfo.addToIndexes(); }
/**
 * Reads the given file into a string and delegates to {@code vectorize(String, String)}.
 *
 * @param input the file whose text is to be vectorized
 * @param label the label of the text
 * @return the dataset produced by {@code vectorize(String, String)} for the file's text
 *     (weights are implementation-specific; they could be word counts or tf-idf scores)
 * @throws RuntimeException wrapping any exception raised while reading the file or while
 *     vectorizing its text
 */
@Override
public DataSet vectorize(File input, String label) {
  try {
    // Both the read and the delegation stay inside the try so every failure is wrapped.
    return vectorize(FileUtils.file2String(input), label);
  } catch (Exception failure) {
    throw new RuntimeException(failure);
  }
}
/** * CasConsumer would use tags and features to write output file, evaluate and print precision, * recall and F-1 measure. * * @param arg0 * @throws ResourceProcessException */ @Override public void processCas(CAS arg0) throws ResourceProcessException { /** convert type of arg0 */ JCas jcas = null; try { jcas = arg0.getJCas(); } catch (CASException e1) { // TODO Auto-generated catch block e1.printStackTrace(); } // TODO Auto-generated method stub FSIterator<Annotation> ite = jcas.getAnnotationIndex(WordTag.type).iterator(); while (ite.hasNext()) { /** collect features */ String id = ((WordTag) ite.get()).getId(); int begin = ((WordTag) ite.get()).getBegin0(); int end = ((WordTag) ite.get()).getEnd0(); String name = ((WordTag) ite.get()).getName(); /** organize string for output */ report.append(id); report.append("|"); report.append(begin); report.append(" "); report.append(end); report.append("|"); report.append(name); report.append("\n"); /** count the length of output string */ count++; ite.next(); } result = report.toString(); File sampleOut = new File("src/main/resources/data/sample.out"); try { testRecall = FileUtils.file2String(sampleOut); } catch (IOException e1) { // TODO Auto-generated catch block e1.printStackTrace(); } /** split strings from file into sentences */ String[] resultSplit = result.split("\n"); String[] recallSplit = testRecall.split("\n"); PrecisionRecallCalculator(recallSplit, resultSplit); /** write the output file to the project root */ String path = "hw1-longh.out"; File dirFile = new File(path); /** make sure no conflict */ if (dirFile.exists()) { dirFile.delete(); } try { /** write file */ BufferedWriter bw1 = new BufferedWriter(new FileWriter(path, true)); bw1.write(report.toString()); bw1.flush(); bw1.close(); } catch (IOException e) { e.printStackTrace(); } }