@Override public DocData getNextDocData(DocData docData) throws NoMoreDataException, IOException { final String line; final int myID; synchronized (this) { line = reader.readLine(); if (line == null) { if (!forever) { throw new NoMoreDataException(); } // Reset the file openFile(); return getNextDocData(docData); } if (docDataLineReader == null) { // first line ever, one time initialization, docDataLineReader = createDocDataLineReader(line); if (skipHeaderLine) { return getNextDocData(docData); } } // increment IDS only once... myID = readCount++; } // The date String was written in the format of DateTools.dateToString. docData.clear(); docData.setID(myID); docDataLineReader.parseLine(docData, line); return docData; }
/**
 * Fills {@code docData} from the next tuple returned by {@code parser.next()}.
 *
 * <p>The tuple is fetched before {@code docData} is cleared, so if fetching
 * throws, {@code docData} is left untouched.
 *
 * @param docData the instance to clear and populate
 * @return the same {@code docData} instance with name, body, date and title set
 * @throws NoMoreDataException declared by the overridden method
 * @throws IOException declared by the overridden method
 */
@Override
public synchronized DocData getNextDocData(DocData docData)
    throws NoMoreDataException, IOException {
  final String[] fields = parser.next();
  docData.clear();

  final String docName = fields[ID];
  docData.setName(docName);

  final String bodyText = fields[BODY];
  docData.setBody(bodyText);

  final String dateText = fields[DATE];
  docData.setDate(dateText);

  final String titleText = fields[TITLE];
  docData.setTitle(titleText);

  return docData;
}
@Override public DocData getNextDocData(DocData docData) throws NoMoreDataException, IOException { File f = null; String name = null; synchronized (this) { if (!inputFiles.hasNext()) { // exhausted files, start a new round, unless forever set to false. if (!forever) { throw new NoMoreDataException(); } inputFiles = new Iterator(dataDir); iteration++; } f = inputFiles.next(); // System.err.println(f); name = f.getCanonicalPath() + "_" + iteration; } BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(f), IOUtils.CHARSET_UTF_8)); String line = null; // First line is the date, 3rd is the title, rest is body String dateStr = reader.readLine(); reader.readLine(); // skip an empty line String title = reader.readLine(); reader.readLine(); // skip an empty line StringBuilder bodyBuf = new StringBuilder(1024); while ((line = reader.readLine()) != null) { bodyBuf.append(line).append(' '); } reader.close(); addBytes(f.length()); Date date = parseDate(dateStr); docData.clear(); docData.setName(name); docData.setBody(bodyBuf.toString()); docData.setTitle(title); docData.setDate(date); return docData; }