/**
 * Returns the next sentence from the underlying UIMA collection, pulling and processing a new
 * CAS whenever the current sentence iterator is exhausted.
 *
 * <p>Synchronized: the sentence iterator and CAS are shared mutable state.
 *
 * @return the next (optionally pre-processed) sentence; "" if fetching a CAS fails mid-iteration;
 *     null when the reader has no more documents
 */
@Override public synchronized String nextSentence() {
  // Refill path: no iterator yet, or the current one is drained.
  if (sentences == null || !sentences.hasNext()) {
    try {
      if (getReader().hasNext()) {
        CAS cas = resource.retrieve();
        try {
          getReader().getNext(cas);
        } catch (Exception e) {
          // Best-effort: treat a failed fetch as end of iteration rather than crashing.
          log.warn("Done iterating returning an empty string");
          return "";
        }
        // Run the analysis engine so Sentence annotations exist in the CAS.
        resource.getAnalysisEngine().process(cas);
        List<String> list = new ArrayList<>();
        for (Sentence sentence : JCasUtil.select(cas.getJCas(), Sentence.class)) {
          list.add(sentence.getCoveredText());
        }
        sentences = list.iterator();
        // needs to be next cas
        while (!sentences.hasNext()) {
          // sentence is empty; go to another cas
          // NOTE(review): uses the `reader` field directly here but getReader() elsewhere —
          // confirm both refer to the same instance.
          if (reader.hasNext()) {
            cas.reset();
            getReader().getNext(cas);
            resource.getAnalysisEngine().process(cas);
            for (Sentence sentence : JCasUtil.select(cas.getJCas(), Sentence.class)) {
              list.add(sentence.getCoveredText());
            }
            sentences = list.iterator();
          } else return null; // no more CASes at all
        }
        String ret = sentences.next();
        if (this.getPreProcessor() != null) ret = this.getPreProcessor().preProcess(ret);
        return ret;
      }
      return null; // reader exhausted
    } catch (Exception e) {
      throw new RuntimeException(e);
    }
  } else {
    // Fast path: current iterator still has sentences.
    String ret = sentences.next();
    if (this.getPreProcessor() != null) ret = this.getPreProcessor().preProcess(ret);
    return ret;
  }
}
/** @see org.apache.uima.collection.CollectionReader#getNext(org.apache.uima.cas.CAS) */ public void getNext(CAS aCAS) throws IOException, CollectionException { JCas jcas; try { jcas = aCAS.getJCas(); } catch (CASException e) { throw new CollectionException(e); } // open input stream to file File file = (File) mFiles.get(mCurrentIndex++); String text = FileUtils.file2String(file, mEncoding); // put document in CAS jcas.setDocumentText(text); // set language if it was explicitly specified as a configuration parameter if (mLanguage != null) { jcas.setDocumentLanguage(mLanguage); } // Also store location of source document in CAS. This information is critical // if CAS Consumers will need to know where the original document contents are located. // For example, the Semantic Search CAS Indexer writes this information into the // search index that it creates, which allows applications that use the search index to // locate the documents that satisfy their semantic queries. SourceDocumentInformation srcDocInfo = new SourceDocumentInformation(jcas); srcDocInfo.setUri(file.getAbsoluteFile().toURL().toString()); srcDocInfo.setOffsetInSource(0); srcDocInfo.setDocumentSize((int) file.length()); srcDocInfo.setLastSegment(mCurrentIndex == mFiles.size()); srcDocInfo.addToIndexes(); }
@Override public void getNext(CAS aCAS) throws IOException, CollectionException { JCas jcas; try { jcas = aCAS.getJCas(); } catch (CASException e) { throw new CollectionException(e); } try { // parse the xml file File xmlFile = GlobalFileStorage.getInstance().poll(); System.out.println("Process file: " + xmlFile.getName()); SAXParserFactory spf = SAXParserFactory.newInstance(); SAXParser sp = spf.newSAXParser(); XMLReader xr = sp.getXMLReader(); LinkedList<String[]> textElements = new LinkedList<>(); FragmentContentHandler fch = new FragmentContentHandler(xr, textElements); xr.setContentHandler(fch); xr.parse(new InputSource(new FileInputStream(xmlFile))); StringBuilder docText = new StringBuilder(); for (String[] element : textElements) { int start = docText.length(); int end = start + element[1].length(); docText.append(element[1] + "\n\n"); Section section = new Section(jcas, start, end); section.setValue(element[0]); section.addToIndexes(); } jcas.setDocumentText(docText.toString().trim()); jcas.setDocumentLanguage(language); DocumentMetaData docMetaData = DocumentMetaData.create(aCAS); docMetaData.setDocumentTitle(xmlFile.getName()); docMetaData.setDocumentId(xmlFile.getAbsolutePath()); docMetaData.setDocumentBaseUri("file:" + xmlFile.getParentFile().getAbsolutePath()); docMetaData.setDocumentUri("file:" + xmlFile.getAbsolutePath()); } catch (Exception e) { // e.printStackTrace(); throw new CollectionException(e); } }
/**
 * Deserializes every binary CAS ({@code *.bin}) found under the given folder.
 *
 * @param path folder containing the serialized CASes
 * @return the deserialized documents as a list of JCas objects
 * @throws Exception if reading or CAS creation fails
 */
public List<JCas> read(String path) throws Exception {
  System.out.println("--- READING ---");
  List<JCas> result = new ArrayList<>();
  @SuppressWarnings("deprecation")
  CollectionReader reader =
      CollectionReaderFactory.createReader(
          BinaryCasReader.class,
          ResourceCollectionReaderBase.PARAM_PATH,
          path,
          ResourceCollectionReaderBase.PARAM_PATTERNS,
          new String[] {ResourceCollectionReaderBase.INCLUDE_PREFIX + "*.bin"});
  while (reader.hasNext()) {
    // A fresh CAS per document, built from the project's type system description.
    CAS cas = CasCreationUtils.createCas(createTypeSystemDescription(), null, null);
    reader.getNext(cas);
    JCas jcas = cas.getJCas();
    System.out.println(jcas.getDocumentText());
    result.add(jcas);
  }
  return result;
}
/**
 * TODO :: 1. construct the global word dictionary 2. keep the word frequency for each sentence
 *
 * <p>Creates two dictionaries queryDictionary and answerDictionary
 *
 * <p>queryDictionary is list of maps with key as the words in the question and value as the count
 * of the word in the question sentence. Similarly answerDictionary is list of maps with key as
 * the words in the answer and value as the count of the word in the answer sentence.
 */
@Override
public void processCas(CAS aCas) throws ResourceProcessException {
  JCas jcas;
  try {
    jcas = aCas.getJCas();
  } catch (CASException e) {
    throw new ResourceProcessException(e);
  }
  // Only the first Document annotation in the index is consumed.
  FSIterator it = jcas.getAnnotationIndex(Document.type).iterator();
  if (it.hasNext()) {
    Document doc = (Document) it.next();
    // Make sure that your previous annotators have populated this in CAS
    FSList fsTokenList = doc.getTokenList();
    ArrayList<Token> tokenList = Utils.fromFSListToCollection(fsTokenList, Token.class);
    // word -> frequency maps for this document (question vs. answer)
    HashMap<String, Integer> myMap = new HashMap<String, Integer>();
    HashMap<String, Integer> myMap2 = new HashMap<String, Integer>();
    // if question then fill QuesqIdList, QuesrelList & queryDictionary
    // NOTE(review): relevance value 99 appears to be the sentinel marking a question —
    // confirm against the annotator that sets it.
    if (doc.getRelevanceValue() == 99) {
      QuesqIdList.add(doc.getQueryID());
      QuesrelList.add(doc.getRelevanceValue());
      for (int k = 0; k < tokenList.size(); k++) {
        myMap.put(tokenList.get(k).getText(), tokenList.get(k).getFrequency());
      }
      queryDictionary.add(myMap);
    }
    // if answer then fill AnsqIdList, AnsrelList & answerDictionary
    else {
      AnsqIdList.add(doc.getQueryID());
      AnsrelList.add(doc.getRelevanceValue());
      for (int k = 0; k < tokenList.size(); k++) {
        myMap2.put(tokenList.get(k).getText(), tokenList.get(k).getFrequency());
      }
      answerDictionary.add(myMap2);
      // Relevance 1 marks the gold answer for this query id.
      if (1 == doc.getRelevanceValue()) {
        GoldAnswerStringList.put(doc.getQueryID(), doc.getText());
      }
    }
    // Do something useful here
    /*for(int i=0;i<tokenList.size();i++)
      System.out.print(tokenList.get(i).getText().toString()+"=>" + tokenList.get(i).getFrequency()+"\t");
    System.out.println();*/
  }
}
public void getNext(CAS aCAS) throws IOException, CollectionException { JCas jcas; try { jcas = aCAS.getJCas(); } catch (CASException e) { throw new CollectionException(e); } fillJCas(jcas); // give an indicator that a file has been processed System.err.print("."); }
@Override public void getNext(CAS aCAS) throws IOException, CollectionException { // TODO Auto-generated method stub JCas jcas; try { jcas = aCAS.getJCas(); } catch (CASException e) { throw new CollectionException(e); } // open input stream to file String sentence = mSentences.get(mCurrentIndex++); // put document in CAS jcas.setDocumentText(sentence); }
/**
 * Gets the next sentence from the input file. Each line is expected to be
 * {@code "<sentenceId> <sentenceText>"}.
 *
 * @param aCas the CAS to populate
 * @throws IOException if the reader fails, hits end of input unexpectedly, or the line is
 *     malformed
 * @throws CollectionException if the CAS cannot be converted to a JCas
 * @see org.apache.uima.collection.CollectionReader#getNext(org.apache.uima.cas.CAS)
 */
@Override
public void getNext(CAS aCas) throws IOException, CollectionException {
  JCas jcas = null;
  try {
    jcas = aCas.getJCas();
  } catch (CASException e) {
    throw new CollectionException(e);
  }
  // Fix: readLine() returns null at EOF — the original NPE'd on .trim().
  String lineString = mBufferdReader.readLine();
  if (lineString == null) {
    throw new IOException("Unexpected end of input: no more sentences to read");
  }
  lineString = lineString.trim();
  // Fix: a line without a space made substring(0, -1) throw; fail with a clear message.
  int sep = lineString.indexOf(' ');
  if (sep < 0) {
    throw new IOException("Malformed input line (expected \"<id> <text>\"): " + lineString);
  }
  String sentenceId = lineString.substring(0, sep);
  // Keeps the leading space, matching the original behavior.
  String sentenceText = lineString.substring(sep);
  jcas.setDocumentText(sentenceText);
  Sentence sentence = new Sentence(jcas);
  sentence.setSentenceId(sentenceId);
  sentence.addToIndexes();
}
@Override public void getNext(CAS aCAS) throws IOException, CollectionException { super.getNext(aCAS); JCas jcas; try { jcas = aCAS.getJCas(); // consider a tweet to be a sentence Sentence sentenceAnno = new Sentence(jcas); sentenceAnno.setBegin(0); sentenceAnno.setEnd(jcas.getDocumentText().length()); sentenceAnno.addToIndexes(); } catch (CASException e) { throw new CollectionException(); } TextClassificationOutcome outcome = new TextClassificationOutcome(jcas); outcome.setOutcome(getTextClassificationOutcome(jcas)); outcome.addToIndexes(); }
/** * Processes the CAS which was populated by the TextAnalysisEngines. <br> * In this case, the CAS is converted to XMI and written into the output file . * * @param aCAS a CAS which has been populated by the TAEs * @throws ResourceProcessException if there is an error in processing the Resource * @see org.apache.uima.collection.base_cpm.CasObjectProcessor#processCas(org.apache.uima.cas.CAS) */ public void processCas(CAS aCAS) throws ResourceProcessException { String modelFileName = null; JCas jcas; try { jcas = aCAS.getJCas(); } catch (CASException e) { throw new ResourceProcessException(e); } // retrieve the filename of the input file from the CAS FSIterator it = jcas.getAnnotationIndex(SourceDocumentInformation.type).iterator(); File outFile = null; if (it.hasNext()) { SourceDocumentInformation fileLoc = (SourceDocumentInformation) it.next(); File inFile; try { inFile = new File(new URL(fileLoc.getUri()).getPath()); String outFileName = inFile.getName(); if (fileLoc.getOffsetInSource() > 0) { outFileName += ("_" + fileLoc.getOffsetInSource()); } outFileName += ".xmi"; outFile = new File(mOutputDir, outFileName); modelFileName = mOutputDir.getAbsolutePath() + "/" + inFile.getName() + ".ecore"; } catch (MalformedURLException e1) { // invalid URL, use default processing below } } if (outFile == null) { outFile = new File(mOutputDir, "doc" + mDocNum++ + ".xmi"); } // serialize XCAS and write to output file try { writeXmi(jcas.getCas(), outFile, modelFileName); } catch (IOException e) { throw new ResourceProcessException(e); } catch (SAXException e) { throw new ResourceProcessException(e); } }
/** @see com.ibm.uima.collection.CollectionReader#getNext(com.ibm.uima.cas.CAS) */
public void getNext(CAS aCAS) throws IOException, CollectionException {
  JCas jcas;
  try {
    jcas = aCAS.getJCas();
  } catch (CASException e) {
    throw new CollectionException(e);
  }
  // Read the next file as UTF-8 text and use it as the document.
  String name = fileIterator.next().getAbsolutePath();
  jcas.setDocumentText(ReadWriteTextFileWithEncoding.read(name, "UTF-8"));
  numberOfFilesProcessed++;
  // Translate the file path into a document ID.
  try {
    name = filenameToIDTranslator.cleanItUp(name);
  } catch (StringCleanerException e) {
    // TODO Auto-generated catch block
    // NOTE(review): failure is swallowed and the raw, uncleaned path is used as the ID
    // below — confirm this best-effort behavior is intended rather than an oversight.
    e.printStackTrace();
  }
  // Store "<idType><name>" as the single secondary document ID.
  StringArray s = new StringArray(jcas, 1);
  s.set(0, filenameToIDTranslator.getIdType() + name);
  ISI_UIMA_Util.setDocumentSecondaryIDs(jcas, s);
}
/**
 * Callback invoked when a remote service finishes processing a CAS: reports any errors, then
 * dumps each token with its POS value to stdout.
 */
public void entityProcessComplete(CAS aCas, EntityProcessStatus aStatus) {
  if (aStatus == null) {
    return;
  }
  if (aStatus.isException()) {
    System.err.println("Error on process CAS call to remote service:");
    for (Exception exception : aStatus.getExceptions()) {
      exception.printStackTrace();
    }
  }
  try {
    JCas cas = aCas.getJCas();
    for (Token token : JCasUtil.select(cas, Token.class)) {
      System.out.println(token.getCoveredText() + " " + token.getPos().getPosValue());
    }
  } catch (CASException e) {
    e.printStackTrace();
  }
}
@Override public void getNext(CAS cas) throws IOException, CollectionException { this.cumulatedLength += currentDoc.getText().length(); logger.info( "[Stream {}] Processing document {}: {} (total length processed: {})", this.streamName, this.mCurrentIndex, this.currentDoc.getUri(), this.cumulatedLength); SourceDocumentInformation sdi; try { sdi = new SourceDocumentInformation(cas.getJCas()); sdi.setUri(currentDoc.getUri()); cas.setDocumentLanguage(mLanguage.getCode()); cas.setDocumentText(currentDoc.getText()); sdi.setDocumentSize(currentDoc.getText().length()); sdi.setCumulatedDocumentSize(this.cumulatedLength); sdi.setBegin(0); sdi.setEnd(currentDoc.getText().length()); sdi.setOffsetInSource(0); sdi.setDocumentIndex(mCurrentIndex); /* * Cannot be known in case of streaming */ sdi.setCorpusSize(-1); sdi.setNbDocuments(-1); // Cannot know if this is the last sdi.setLastSegment(false); sdi.addToIndexes(); this.mCurrentIndex++; } catch (CASException e) { throw new CollectionException(e); } }
/**
 * Writes every SemanticAnnotation of the CAS to the output stream, one record per line:
 * namespace, identifier, offset and covered text, separated by the field separator.
 * Newlines inside the covered text are optionally collapsed to spaces.
 */
@Override
public void process(CAS cas) throws AnalysisEngineProcessException {
  JCas textJCas;
  try {
    textJCas = cas.getJCas();
    setStream(textJCas);
  } catch (CASException e1) {
    throw new AnalysisEngineProcessException(e1);
  } catch (final IOException e2) {
    throw new AnalysisEngineProcessException(e2);
  }
  final FSIterator<Annotation> iter = SemanticAnnotation.getIterator(textJCas);
  while (iter.hasNext()) {
    final SemanticAnnotation annotation = (SemanticAnnotation) iter.next();
    String text = annotation.getCoveredText();
    if (replaceNewlines) {
      // collapse in-annotation line breaks so each record stays on one line
      text = StringUtils.join(' ', text.split(LINEBREAK));
    }
    try {
      write(annotation.getNamespace());
      write(fieldSeparator);
      write(annotation.getIdentifier());
      write(fieldSeparator);
      write(annotation.getOffset().toString());
      write(fieldSeparator);
      write(text);
      write(LINEBREAK);
    } catch (final IOException e) {
      throw new AnalysisEngineProcessException(e);
    }
  }
  try {
    unsetStream();
  } catch (final IOException e) {
    throw new AnalysisEngineProcessException(e);
  }
}
/** * CasConsumer would use tags and features to write output file, evaluate and print precision, * recall and F-1 measure. * * @param arg0 * @throws ResourceProcessException */ @Override public void processCas(CAS arg0) throws ResourceProcessException { /** convert type of arg0 */ JCas jcas = null; try { jcas = arg0.getJCas(); } catch (CASException e1) { // TODO Auto-generated catch block e1.printStackTrace(); } // TODO Auto-generated method stub FSIterator<Annotation> ite = jcas.getAnnotationIndex(WordTag.type).iterator(); while (ite.hasNext()) { /** collect features */ String id = ((WordTag) ite.get()).getId(); int begin = ((WordTag) ite.get()).getBegin0(); int end = ((WordTag) ite.get()).getEnd0(); String name = ((WordTag) ite.get()).getName(); /** organize string for output */ report.append(id); report.append("|"); report.append(begin); report.append(" "); report.append(end); report.append("|"); report.append(name); report.append("\n"); /** count the length of output string */ count++; ite.next(); } result = report.toString(); File sampleOut = new File("src/main/resources/data/sample.out"); try { testRecall = FileUtils.file2String(sampleOut); } catch (IOException e1) { // TODO Auto-generated catch block e1.printStackTrace(); } /** split strings from file into sentences */ String[] resultSplit = result.split("\n"); String[] recallSplit = testRecall.split("\n"); PrecisionRecallCalculator(recallSplit, resultSplit); /** write the output file to the project root */ String path = "hw1-longh.out"; File dirFile = new File(path); /** make sure no conflict */ if (dirFile.exists()) { dirFile.delete(); } try { /** write file */ BufferedWriter bw1 = new BufferedWriter(new FileWriter(path, true)); bw1.write(report.toString()); bw1.flush(); bw1.close(); } catch (IOException e) { e.printStackTrace(); } }
/* (non-Javadoc)
 * @see org.apache.uima.collection.base_cpm.CasObjectProcessor#processCas(org.apache.uima.cas.CAS)
 */
// Evaluates one tweet: compares the NER entity labels and the sentiment polarity produced
// by the pipeline against the gold evaluation corpus, accumulating per-label counts in the
// stat/senti matrices ([label][0]=classified, [label][1]=gold, [label][2]=correct) and
// entity-span counts in classifiedEntityCnt/answerEntityCnt.
public void processCas(CAS aCAS) throws ResourceProcessException {
  JCas jcas;
  try {
    jcas = aCAS.getJCas();
  } catch (CASException e) {
    logger.log(Level.SEVERE, e.getMessage());
    throw new ResourceProcessException(e);
  }
  // The pipeline's annotation for this tweet; gold data comes from the parallel corpus reader.
  TweetAnnotation tweetAnn =
      (TweetAnnotation) jcas.getAnnotationIndex(TweetAnnotation.type).iterator().next();
  OMTweet answerTweet = evalCorpusReader.next();
  // Both corpora must be iterated in lock-step; diverging IDs mean they are out of sync.
  if (!answerTweet.getId().equals(tweetAnn.getId())) {
    logger.log(
        Level.SEVERE,
        "target corpus and evaluation corpus don't match to each other - "
            + answerTweet.getId()
            + ", "
            + tweetAnn.getId());
    throw new ResourceProcessException();
  }
  // Gold entity tag per token, extracted from the annotated gold text.
  String[] entity = extractEntityTags(answerTweet.getText());
  String classified = null;
  String prevClassified = null;
  StringBuffer sb = new StringBuffer();
  try {
    // Report header: "[goldPolarity=>classifiedPolarity] tweet text"
    sb.append("\n[");
    sb.append(answerTweet.getPolarityString());
    sb.append("=>");
    sb.append(tweetAnn.getPolarity());
    sb.append("] ");
    sb.append(tweetAnn.getCoveredText());
    sb.append('\n');
    FSIterator<Annotation> tokenAnnIter =
        jcas.getAnnotationIndex(TokenAnnotation.type).iterator();
    TokenAnnotation tokenAnn = null;
    int i = 0; // index into the gold entity[] array, parallel to the token iteration
    int prevClassifiedIdx = labelNoneIdx;
    int prevAnswerIdx = labelNoneIdx;
    String classifiedEntityStr = "";
    String answerEntityStr = "";
    while (tokenAnnIter.hasNext()) {
      tokenAnn = (TokenAnnotation) tokenAnnIter.next();
      classified = tokenAnn.getEntityLabel();
      String answer = entity[i];
      boolean correct = false;
      if (classified.equals(answer)) {
        correct = true;
      }
      int classifiedIdx = 0;
      int answerIdx = 0;
      // Unknown tags (not in `map`) are logged and treated as the "none" label.
      try {
        answerIdx = map.get(answer);
      } catch (Exception e) {
        logger.log(
            Level.SEVERE,
            "wrong annotation on the evaluation corpus - tweet id: "
                + answerTweet.getId()
                + ", answerTag="
                + answer);
        logger.log(Level.SEVERE, e.getMessage());
        answerIdx = map.get(labelNone);
      }
      try {
        classifiedIdx = map.get(classified);
      } catch (Exception e) {
        logger.log(
            Level.SEVERE,
            "wrong annotation from the NER - tweet id: "
                + answerTweet.getId()
                + ", classifiedTag="
                + classified);
        logger.log(Level.SEVERE, e.getMessage());
        classifiedIdx = map.get(labelNone);
      }
      // Per-label token counts: [0]=classified occurrences, [1]=gold occurrences, [2]=hits.
      stat[classifiedIdx][0]++;
      stat[answerIdx][1]++;
      if (correct) {
        stat[classifiedIdx][2]++;
      }
      // Entity-span tracking for classified labels. Labels appear to be grouped in
      // threes (idx / 3 identifies the entity type) — NOTE(review): confirm the
      // label-index layout; a new span starts whenever the type group changes.
      if (classifiedIdx != labelNoneIdx) {
        if (classifiedIdx / 3 != prevClassifiedIdx / 3) {
          classifiedEntityCnt[classifiedIdx / 3]++;
          // Close out the previous span and report it as "text -> label-prefix".
          if (prevClassifiedIdx != labelNoneIdx) {
            sb.append('\t');
            sb.append(classifiedEntityStr);
            sb.append(" -> ");
            sb.append(prevClassified.substring(0, prevClassified.lastIndexOf('_')));
            sb.append('\n');
          }
          classifiedEntityStr = tokenAnn.getCoveredText();
        } else {
          classifiedEntityStr += " " + tokenAnn.getCoveredText();
        }
      } else if (prevClassifiedIdx != labelNoneIdx) {
        // Span ended on a "none" token: flush it.
        sb.append('\t');
        sb.append(classifiedEntityStr);
        sb.append(" -> ");
        sb.append(prevClassified.substring(0, prevClassified.lastIndexOf('_')));
        sb.append('\n');
        classifiedEntityStr = "";
      }
      prevClassifiedIdx = classifiedIdx;
      // Same span bookkeeping for the gold labels (counts only, no report lines).
      if (answerIdx != labelNoneIdx) {
        if (answerIdx / 3 != prevAnswerIdx / 3) {
          answerEntityCnt[answerIdx / 3]++;
          answerEntityStr = tokenAnn.getCoveredText();
        } else {
          answerEntityStr += " " + tokenAnn.getCoveredText();
        }
      } else if (prevAnswerIdx != labelNoneIdx) {
        answerEntityStr = "";
      }
      prevAnswerIdx = answerIdx;
      prevClassified = classified;
      i++;
    }
    // Flush a classified span that runs to the end of the tweet.
    if (prevClassifiedIdx != labelNoneIdx) {
      sb.append('\t');
      sb.append(classifiedEntityStr);
      sb.append(" -> ");
      sb.append(prevClassified.substring(0, prevClassified.lastIndexOf('_')));
      sb.append('\n');
    }
    // senti
    String answerSenti = answerTweet.getPolarityString();
    boolean correct = false;
    String classifiedSenti = tweetAnn.getPolarity();
    // NOTE(review): `senti` is the int[][] counts matrix, so String.equals(senti) is
    // always false — this line is dead code and was probably meant to compare against
    // answerSenti. The classifiedIdx == answerIdx check below does the real work;
    // confirm and remove.
    if (classifiedSenti.equals(senti)) {
      correct = true;
    }
    int classifiedIdx = sentiIdx(classifiedSenti);
    int answerIdx = sentiIdx(answerSenti);
    senti[classifiedIdx][0]++;
    senti[answerIdx][1]++;
    if (classifiedIdx == answerIdx) {
      correct = true;
    }
    if (correct) {
      senti[classifiedIdx][2]++;
    }
    cnt++;
    logger.log(Level.INFO, sb.toString());
  } catch (CASRuntimeException e) {
    throw new ResourceProcessException(e);
  }
}
/** @see org.apache.uima.collection.CollectionReader#getNext(org.apache.uima.cas.CAS) */ public void getNext(CAS aCAS) throws IOException, CollectionException { JCas jcas; try { jcas = aCAS.getJCas(); } catch (CASException e) { throw new CollectionException(e); } // put document in CAS jcas.setDocumentText(mNextText.getText()); for (Paragraph para : mNextText.getParagraphs()) { GoldenParagraph p = new GoldenParagraph(jcas); for (SentenceEx sent : para.getSentences()) { GoldenSentence s = new GoldenSentence(jcas); s.setId(sent.getId()); s.setBegin(sent.getStart()); s.setEnd(sent.getEnd()); List<GrEr> grers = sent.getGrammarErrors(); if (grers.size() > 0) { FSArray fsarr = new FSArray(jcas, grers.size()); for (int j = 0; j < grers.size(); j++) { GoldenGrammarError ge = new GoldenGrammarError(jcas); ge.setBegin(grers.get(j).getStart()); ge.setEnd(grers.get(j).getEnd()); ge.setCategory(grers.get(j).getCat()); ge.setError(grers.get(j).getErr()); ge.setReplace(grers.get(j).getRep()); ge.addToIndexes(); fsarr.set(j, ge); } s.setGoldenGrammarErrors(fsarr); } s.addToIndexes(); } p.setId(para.getId()); p.setBegin(para.getStart()); p.setEnd(para.getEnd()); p.addToIndexes(); } // set language if it was explicitly specified as a configuration parameter if (mLanguage != null) { ((DocumentAnnotation) jcas.getDocumentAnnotationFs()).setLanguage(mLanguage); } // Also store location of source document in CAS. This information is // critical // if CAS Consumers will need to know where the original document contents // are located. // For example, the Semantic Search CAS Indexer writes this information into // the // search index that it creates, which allows applications that use the // search index to // locate the documents that satisfy their semantic queries. 
// SourceDocumentInformation srcDocInfo = new // SourceDocumentInformation(jcas); // srcDocInfo.setUri(file.getAbsoluteFile().toURL().toString()); // srcDocInfo.setOffsetInSource(0); // srcDocInfo.setDocumentSize((int) file.length()); // srcDocInfo.setLastSegment(mCurrentIndex == mFiles.size()); // srcDocInfo.addToIndexes(); mCurrentText++; mNextText = mMultiReader.read(); }
/** @see org.apache.uima.collection.CollectionReader#getNext(org.apache.uima.cas.CAS) */ public void getNext(CAS aCAS) throws IOException, CollectionException { JCas jcas; try { jcas = aCAS.getJCas(); } catch (CASException e) { throw new CollectionException(e); } // open input stream to file File file = (File) mFiles.get(mCurrentIndex++); System.out.println("Reading file: " + file.getAbsolutePath()); // read zipped file String text; BufferedReader reader = new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(file)))); StringBuffer textBuffer = new StringBuffer(); Integer currindex = -1; while (reader.ready()) { PubmedDocument pmdoc = new PubmedDocument(jcas); String s = reader.readLine(); // System.out.println(s); // split line into pmid and text String[] two = new String[2]; two = splitFirst(s, "\t"); pmdoc.setPmid(two[0]); String annot = new String(two[1]); // append text textBuffer.append(annot + "\n"); // pmdoc.setBegin(currindex + two[0].length() + 1); pmdoc.setBegin(currindex + 1); Integer len = annot.length(); currindex = currindex + len + 1; pmdoc.setEnd(currindex); // System.out.println( "pmid: "+two[0] + "\t" + // "[begin/end]:"+ pmdoc.getBegin() + "/" + pmdoc.getEnd() + "\t" + // "annot:" + annot // ); // System.out.println(annot.substring(pmdoc.getBegin(), pmdoc.getEnd())); pmdoc.addToIndexes(); } text = textBuffer.toString(); // System.out.println(text); // old File to String Method // String text = FileUtils.file2String(file, mEncoding); // put document in CAS jcas.setDocumentText(text); // set language if it was explicitly specified as a configuration parameter if (mLanguage != null) { ((DocumentAnnotation) jcas.getDocumentAnnotationFs()).setLanguage(mLanguage); } // Also store location of source document in CAS. This information is critical // if CAS Consumers will need to know where the original document contents are located. 
// For example, the Semantic Search CAS Indexer writes this information into the // search index that it creates, which allows applications that use the search index to // locate the documents that satisfy their semantic queries. SourceDocumentInformation srcDocInfo = new SourceDocumentInformation(jcas); srcDocInfo.setUri(file.getAbsoluteFile().toURI().toString()); srcDocInfo.setOffsetInSource(0); srcDocInfo.setDocumentSize((int) file.length()); srcDocInfo.setLastSegment(mCurrentIndex == mFiles.size()); srcDocInfo.setBegin(0); srcDocInfo.setEnd(currindex); srcDocInfo.addToIndexes(); }