void openNextFile() throws NoMoreDataException, IOException { close(); currPathType = null; while (true) { if (nextFile >= inputFiles.size()) { // exhausted files, start a new round, unless forever set to false. if (!forever) { throw new NoMoreDataException(); } nextFile = 0; iteration++; } File f = inputFiles.get(nextFile++); if (verbose) { System.out.println("opening: " + f + " length: " + f.length()); } try { InputStream inputStream = StreamUtils.inputStream( f); // support either gzip, bzip2, or regular text file, by extension reader = new BufferedReader( new InputStreamReader(inputStream, encoding), StreamUtils.BUFFER_SIZE); currPathType = TrecDocParser.pathType(f); return; } catch (Exception e) { if (verbose) { System.out.println( "Skipping 'bad' file " + f.getAbsolutePath() + " due to " + e.getMessage()); continue; } throw new NoMoreDataException(); } } }
@Override public DocData getNextDocData(DocData docData) throws NoMoreDataException, IOException { String name = null; StringBuilder docBuf = getDocBuffer(); ParsePathType parsedPathType; // protect reading from the TREC files by multiple threads. The rest of the // method, i.e., parsing the content and returning the DocData can run unprotected. synchronized (lock) { if (reader == null) { openNextFile(); } // 1. skip until doc start - required for all TREC formats docBuf.setLength(0); read(docBuf, DOC, false, false); // save parsedFile for passing trecDataParser after the sync block, in // case another thread will open another file in between. parsedPathType = currPathType; // 2. name - required for all TREC formats docBuf.setLength(0); read(docBuf, DOCNO, true, false); name = docBuf .substring(DOCNO.length(), docBuf.indexOf(TERMINATING_DOCNO, DOCNO.length())) .trim(); if (!excludeDocnameIteration) { name = name + "_" + iteration; } // 3. read all until end of doc docBuf.setLength(0); read(docBuf, TERMINATING_DOC, false, true); } // count char length of text to be parsed (may be larger than the resulted plain doc body text). addBytes(docBuf.length()); // This code segment relies on HtmlParser being thread safe. When we get // here, everything else is already private to that thread, so we're safe. try { docData = trecDocParser.parse(docData, name, this, docBuf, parsedPathType); addDoc(); } catch (InterruptedException ie) { throw new ThreadInterruptedException(ie); } return docData; }