Пример #1
0
 /**
  * Closes whatever is currently open (via {@code close()}) and opens the next readable file from
  * {@code inputFiles}, leaving {@code reader} and {@code currPathType} set for that file.
  *
  * <p>When the list is exhausted, a new round is started (and {@code iteration} incremented) if
  * {@code forever} is set; otherwise the method gives up. A file that fails to open is skipped
  * when {@code verbose} is on, and ends the data otherwise.
  *
  * @throws NoMoreDataException if all files were consumed and {@code forever} is false, or if a
  *     file could not be opened while {@code verbose} is off (the original failure is attached as
  *     a suppressed exception)
  * @throws IOException declared for callers; presumably propagated from {@code close()}
  */
 void openNextFile() throws NoMoreDataException, IOException {
   close();
   currPathType = null;
   while (true) {
     if (nextFile >= inputFiles.size()) {
       // exhausted files, start a new round, unless forever set to false.
       if (!forever) {
         throw new NoMoreDataException();
       }
       nextFile = 0;
       iteration++;
     }
     File f = inputFiles.get(nextFile++);
     if (verbose) {
       System.out.println("opening: " + f + " length: " + f.length());
     }
     // Declared outside the try so the failure path can release it.
     InputStream inputStream = null;
     try {
       // support either gzip, bzip2, or regular text file, by extension
       inputStream = StreamUtils.inputStream(f);
       reader =
           new BufferedReader(
               new InputStreamReader(inputStream, encoding), StreamUtils.BUFFER_SIZE);
       currPathType = TrecDocParser.pathType(f);
       return;
     } catch (Exception e) {
       // Do not leak the file handle: the stream may already be open when the reader
       // construction or the path-type lookup fails.
       if (inputStream != null) {
         try {
           inputStream.close();
         } catch (IOException closeFailure) {
           e.addSuppressed(closeFailure);
         }
       }
       // Never leave the field pointing at a reader whose underlying stream was just closed.
       reader = null;
       if (verbose) {
         System.out.println(
             "Skipping 'bad' file " + f.getAbsolutePath() + " due to " + e.getMessage());
         continue;
       }
       // Keep the root failure reachable for diagnostics; no cause-accepting constructor of
       // NoMoreDataException is visible from here, so attach it as a suppressed exception.
       NoMoreDataException noMoreData = new NoMoreDataException();
       noMoreData.addSuppressed(e);
       throw noMoreData;
     }
   }
 }
Пример #2
0
  /**
   * Reads the next TREC document from the current input and hands its raw text to
   * {@code trecDocParser}.
   *
   * <p>Only the reading phase runs under {@code lock}; parsing happens afterwards on data that is
   * local to the calling thread.
   *
   * @param docData the instance passed through to the parser
   * @return whatever {@code trecDocParser.parse} returns for this document
   * @throws NoMoreDataException declared for callers; raised by the file-opening / reading helpers
   * @throws IOException declared for callers; raised by the file-opening / reading helpers
   */
  @Override
  public DocData getNextDocData(DocData docData) throws NoMoreDataException, IOException {
    final StringBuilder buffer = getDocBuffer();
    final String docName;
    final ParsePathType pathTypeSnapshot;

    // Reading from the TREC files must be serialized across threads. Everything after this
    // block (parsing the content, returning the DocData) can run without the lock.
    synchronized (lock) {
      if (reader == null) {
        openNextFile();
      }

      // Step 1: advance to the start of the next document - required for all TREC formats.
      buffer.setLength(0);
      read(buffer, DOC, false, false);

      // Capture the path type now, so the parser gets the value belonging to this document
      // even if another thread opens a different file once the lock is released.
      pathTypeSnapshot = currPathType;

      // Step 2: the document name - required for all TREC formats.
      buffer.setLength(0);
      read(buffer, DOCNO, true, false);
      final int nameStart = DOCNO.length();
      final int nameEnd = buffer.indexOf(TERMINATING_DOCNO, nameStart);
      final String rawName = buffer.substring(nameStart, nameEnd).trim();
      docName = excludeDocnameIteration ? rawName : rawName + "_" + iteration;

      // Step 3: collect everything up to the end of the document.
      buffer.setLength(0);
      read(buffer, TERMINATING_DOC, false, true);
    }

    // Account for the char length of the text to be parsed (this may exceed the length of the
    // resulting plain doc body text).
    addBytes(buffer.length());

    // This part relies on HtmlParser being thread safe. At this point everything else is
    // already private to the current thread, so no locking is needed.
    try {
      final DocData parsed =
          trecDocParser.parse(docData, docName, this, buffer, pathTypeSnapshot);
      addDoc();
      return parsed;
    } catch (InterruptedException ie) {
      throw new ThreadInterruptedException(ie);
    }
  }