@Override public void parseLine(DocData docData, String line) { int k1 = 0; int k2 = line.indexOf(WriteLineDocTask.SEP, k1); if (k2 < 0) { throw new RuntimeException( "line: [" + line + "] is in an invalid format (missing: separator title::date)!"); } docData.setTitle(line.substring(k1, k2)); k1 = k2 + 1; k2 = line.indexOf(WriteLineDocTask.SEP, k1); if (k2 < 0) { throw new RuntimeException( "line: [" + line + "] is in an invalid format (missing: separator date::body)!"); } docData.setDate(line.substring(k1, k2)); k1 = k2 + 1; k2 = line.indexOf(WriteLineDocTask.SEP, k1); if (k2 >= 0) { throw new RuntimeException( "line: [" + line + "] is in an invalid format (too many separators)!"); } // last one docData.setBody(line.substring(k1)); }
@Override public DocData getNextDocData(DocData docData) throws NoMoreDataException, IOException { final String line; final int myID; synchronized (this) { line = reader.readLine(); if (line == null) { if (!forever) { throw new NoMoreDataException(); } // Reset the file openFile(); return getNextDocData(docData); } if (docDataLineReader == null) { // first line ever, one time initialization, docDataLineReader = createDocDataLineReader(line); if (skipHeaderLine) { return getNextDocData(docData); } } // increment IDS only once... myID = readCount++; } // The date String was written in the format of DateTools.dateToString. docData.clear(); docData.setID(myID); docDataLineReader.parseLine(docData, line); return docData; }
/**
 * Same as {@link #makeDocument()}, only this method creates a document of the given size input by
 * <code>size</code>.
 *
 * <p>Text left over from a previous call (held in {@code leftovr}) is consumed first. While the
 * available body is shorter than {@code size}, further documents are pulled from the source and
 * their bodies are appended. Whatever body text remains after the document is created is saved as
 * the leftover for the next call.
 *
 * <p>A {@code null} body is treated as empty text when bodies are joined, so the literal string
 * "null" never ends up in a document body.
 *
 * @param size the requested body size, passed on to {@code createDocument}
 * @return the created document
 * @throws Exception if the source fails to supply data or the document cannot be created
 */
public Document makeDocument(int size) throws Exception {
  LeftOver lvr = leftovr.get();
  if (lvr == null
      || lvr.docdata == null
      || lvr.docdata.getBody() == null
      || lvr.docdata.getBody().length() == 0) {
    resetLeftovers();
    // The leftover is unusable, so drop the local reference as well. Otherwise a null docdata
    // would be dereferenced below, and the new leftover would be written to an object that was
    // never re-registered via leftovr.set(...).
    // NOTE(review): assumes resetLeftovers() discards the stored leftover - confirm.
    lvr = null;
  }

  DocData docData = getDocState().docData;
  DocData dd = (lvr == null ? source.getNextDocData(docData) : lvr.docdata);
  int cnt = (lvr == null ? 0 : lvr.cnt);

  while (dd.getBody() == null || dd.getBody().length() < size) {
    String carriedBody = dd.getBody();
    dd = source.getNextDocData(new DocData());
    cnt = 0;
    String freshBody = dd.getBody();
    // Null-safe join: plain "+" would convert a null body into the text "null".
    dd.setBody((carriedBody == null ? "" : carriedBody) + (freshBody == null ? "" : freshBody));
  }

  Document doc = createDocument(dd, size, cnt);

  if (dd.getBody() == null || dd.getBody().length() == 0) {
    // Everything was consumed, nothing to carry over.
    resetLeftovers();
  } else {
    if (lvr == null) {
      lvr = new LeftOver();
      leftovr.set(lvr);
    }
    lvr.docdata = dd;
    lvr.cnt = ++cnt;
  }
  return doc;
}
/**
 * Fetches the next tuple from the parser and copies its fields into {@code docData}.
 *
 * <p>The tuple is fetched before {@code docData} is cleared, so if the parser throws,
 * {@code docData} is left untouched.
 *
 * @param docData the instance to clear and fill
 * @return the same {@code docData} instance, with name, body, date and title set from the tuple
 * @throws NoMoreDataException declared by the overridden method
 * @throws IOException declared by the overridden method
 */
@Override
public synchronized DocData getNextDocData(DocData docData)
    throws NoMoreDataException, IOException {
  final String[] fields = parser.next();

  docData.clear();
  // The tuple's ID slot is used as the document name.
  docData.setName(fields[ID]);
  docData.setBody(fields[BODY]);
  docData.setDate(fields[DATE]);
  docData.setTitle(fields[TITLE]);

  return docData;
}
@Override public DocData getNextDocData(DocData docData) throws NoMoreDataException, IOException { File f = null; String name = null; synchronized (this) { if (!inputFiles.hasNext()) { // exhausted files, start a new round, unless forever set to false. if (!forever) { throw new NoMoreDataException(); } inputFiles = new Iterator(dataDir); iteration++; } f = inputFiles.next(); // System.err.println(f); name = f.getCanonicalPath() + "_" + iteration; } BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(f), IOUtils.CHARSET_UTF_8)); String line = null; // First line is the date, 3rd is the title, rest is body String dateStr = reader.readLine(); reader.readLine(); // skip an empty line String title = reader.readLine(); reader.readLine(); // skip an empty line StringBuilder bodyBuf = new StringBuilder(1024); while ((line = reader.readLine()) != null) { bodyBuf.append(line).append(' '); } reader.close(); addBytes(f.length()); Date date = parseDate(dateStr); docData.clear(); docData.setName(name); docData.setBody(bodyBuf.toString()); docData.setTitle(title); docData.setDate(date); return docData; }
public double getFirstPos(DocData Doc, QueryData query) { // TODO Auto-generated method stub // int length = Doc.getBodyLength(); double sum = 0; int termCount = 0; for (ArrayList<Integer> li : Doc.getbodyHitPos()) { for (int i : li) { if (i == 1) sum += 1 / Math.log(1.2); else sum += 1 / Math.log(i); termCount++; } } if (sum == 0) return 0; else return sum / termCount; }
/**
 * Stores {@code text} in the {@link DocData} slot that corresponds to the given column position.
 *
 * <p>The target is looked up in {@code posToF}. Columns mapped to {@code PROP} are stored as a
 * property keyed by the column's entry in {@code header}; the properties object is created on
 * first use. Any other mapping value leaves {@code docData} untouched.
 *
 * @param docData the instance to update
 * @param position index of the column the text was read from
 * @param text the value to store
 */
private void setDocDataField(DocData docData, int position, String text) {
  switch (posToF[position]) {
    case NAME:
      docData.setName(text);
      return;

    case TITLE:
      docData.setTitle(text);
      return;

    case DATE:
      docData.setDate(text);
      return;

    case BODY:
      docData.setBody(text);
      return;

    case PROP: {
      Properties props = docData.getProps();
      if (props == null) {
        // First property for this document: create the container lazily.
        props = new Properties();
        docData.setProps(props);
      }
      props.setProperty(header[position], text);
      return;
    }
  }
}
/**
 * Creates a document from the given doc data.
 *
 * <p>Only part of the body is used when {@code size} is positive and smaller than the body
 * length; {@code docData} is then modified so that it keeps the rest. Otherwise the whole body is
 * used and the body of {@code docData} is set to the empty string. When properties are indexed,
 * the properties of {@code docData} are reset to null afterwards so they are not added more than
 * once.
 *
 * <p>Fields are added in this order: ID, name, date string, date in milliseconds, time of day in
 * seconds, title, then (if the body is non-empty) body and optionally its bytes, then (if
 * enabled) one field per property.
 *
 * @param docData source of the field values; its body and properties may be modified
 * @param size requested body length; values {@code <= 0} mean "use the whole body"
 * @param cnt suffix appended to the name as {@code "_" + cnt}; a negative value leaves the name
 *     unchanged
 * @return the populated document (the reused per-state instance when {@code reuseFields} is set,
 *     a new one otherwise)
 * @throws UnsupportedEncodingException declared because of {@code getBytes("UTF-8")}
 */
private Document createDocument(DocData docData, int size, int cnt) throws UnsupportedEncodingException {
  final DocState ds = getDocState();
  final Document doc = reuseFields ? ds.doc : new Document();
  // A reused document still holds the fields of the previous call.
  doc.getFields().clear();

  // Set ID_FIELD
  Field idField = ds.getField(ID_FIELD, storeVal, Index.NOT_ANALYZED_NO_NORMS, termVecVal);
  int id;
  if (r != null) {
    // When r is set, the ID comes from r.nextInt(updateDocIDLimit) instead of the doc data.
    id = r.nextInt(updateDocIDLimit);
  } else {
    id = docData.getID();
    if (id == -1) {
      // -1 is treated as "no ID supplied": fall back to the running counter.
      id = numDocsCreated.getAndIncrement();
    }
  }
  idField.setValue(Integer.toString(id));
  doc.add(idField);

  // Set NAME_FIELD
  String name = docData.getName();
  if (name == null) name = "";
  name = cnt < 0 ? name : name + "_" + cnt;
  Field nameField = ds.getField(NAME_FIELD, storeVal, indexVal, termVecVal);
  nameField.setValue(name);
  doc.add(nameField);

  // Set DATE_FIELD
  // The date helper is fetched from dateParsers and created on first use.
  DateUtil util = dateParsers.get();
  if (util == null) {
    util = new DateUtil();
    dateParsers.set(util);
  }
  Date date = null;
  String dateString = docData.getDate();
  if (dateString != null) {
    // Restart parsing from the beginning of the string.
    util.pos.setIndex(0);
    date = util.parser.parse(dateString, util.pos);
    // System.out.println(dateString + " parsed to " + date);
  } else {
    dateString = "";
  }
  Field dateStringField = ds.getField(DATE_FIELD, storeVal, indexVal, termVecVal);
  dateStringField.setValue(dateString);
  doc.add(dateStringField);

  if (date == null) {
    // just set to right now
    date = new Date();
  }

  NumericField dateField = ds.getNumericField(DATE_MSEC_FIELD);
  dateField.setLongValue(date.getTime());
  doc.add(dateField);

  // Time of day expressed as seconds since midnight, according to the helper's calendar.
  util.cal.setTime(date);
  final int sec = util.cal.get(Calendar.HOUR_OF_DAY) * 3600 + util.cal.get(Calendar.MINUTE) * 60 + util.cal.get(Calendar.SECOND);

  NumericField timeSecField = ds.getNumericField(TIME_SEC_FIELD);
  timeSecField.setIntValue(sec);
  doc.add(timeSecField);

  // Set TITLE_FIELD
  String title = docData.getTitle();
  Field titleField = ds.getField(TITLE_FIELD, storeVal, indexVal, termVecVal);
  titleField.setValue(title == null ? "" : title);
  doc.add(titleField);

  // Body: no body field is added at all when the body is null or empty.
  String body = docData.getBody();
  if (body != null && body.length() > 0) {
    String bdy;
    if (size <= 0 || size >= body.length()) {
      bdy = body; // use all
      docData.setBody(""); // nothing left
    } else {
      // attempt not to break words - if whitespace found within next 20 chars...
      // The scan starts at index size - 1, so the cut may also land one character before size.
      // NOTE(review): with size == 1 and whitespace at index 0, size becomes 0; bdy is then
      // empty and the remaining body is unchanged - confirm callers cannot get stuck on this.
      for (int n = size - 1; n < size + 20 && n < body.length(); n++) {
        if (Character.isWhitespace(body.charAt(n))) {
          size = n;
          break;
        }
      }
      bdy = body.substring(0, size); // use part
      docData.setBody(body.substring(size)); // some left
    }
    Field bodyField = ds.getField(BODY_FIELD, bodyStoreVal, bodyIndexVal, termVecVal);
    bodyField.setValue(bdy);
    doc.add(bodyField);

    if (storeBytes) {
      // Additionally store the UTF-8 bytes of the body text that was used.
      Field bytesField = ds.getField(BYTES_FIELD, Store.YES, Index.NOT_ANALYZED_NO_NORMS, TermVector.NO);
      bytesField.setValue(bdy.getBytes("UTF-8"));
      doc.add(bytesField);
    }
  }

  if (indexProperties) {
    Properties props = docData.getProps();
    if (props != null) {
      // One field per property; keys and values are cast to String.
      for (final Map.Entry<Object, Object> entry : props.entrySet()) {
        Field f = ds.getField((String) entry.getKey(), storeVal, indexVal, termVecVal);
        f.setValue((String) entry.getValue());
        doc.add(f);
      }
      // Reset so the same properties are not added again on a later call with this docData.
      docData.setProps(null);
    }
  }

  // System.out.println("============== Created doc "+numDocsCreated+" :\n"+doc+"\n==========");
  return doc;
}