Пример #1
0
 /**
  * Splits {@code line} into title, date and body and stores them in {@code docData}.
  *
  * <p>The line must contain exactly two {@code WriteLineDocTask.SEP} separators:
  * {@code title SEP date SEP body}. Fields are stored as soon as they are recognised, so a
  * malformed line may leave {@code docData} partially populated when the exception is thrown.
  *
  * @param docData receives the parsed title, date and body
  * @param line the raw line to parse
  * @throws RuntimeException if the line has fewer or more than two separators
  */
 @Override
 public void parseLine(DocData docData, String line) {
   final int titleEnd = line.indexOf(WriteLineDocTask.SEP);
   if (titleEnd < 0) {
     throw new RuntimeException(
         "line: [" + line + "] is in an invalid format (missing: separator title::date)!");
   }
   docData.setTitle(line.substring(0, titleEnd));

   // Each field starts one character past the previous separator.
   final int dateStart = titleEnd + 1;
   final int dateEnd = line.indexOf(WriteLineDocTask.SEP, dateStart);
   if (dateEnd < 0) {
     throw new RuntimeException(
         "line: [" + line + "] is in an invalid format (missing: separator date::body)!");
   }
   docData.setDate(line.substring(dateStart, dateEnd));

   final int bodyStart = dateEnd + 1;
   final boolean hasExtraSeparator = line.indexOf(WriteLineDocTask.SEP, bodyStart) >= 0;
   if (hasExtraSeparator) {
     throw new RuntimeException(
         "line: [" + line + "] is in an invalid format (too many separators)!");
   }
   // Everything after the second separator is the body.
   docData.setBody(line.substring(bodyStart));
 }
Пример #2
0
  /**
   * Reads the next line from {@code reader} and parses it into {@code docData}.
   *
   * <p>Reading the line, the one-time creation of the line parser and the ID assignment all happen
   * while holding this object's monitor; the parsing of the line itself happens outside of it.
   *
   * @param docData the instance to clear and fill
   * @return the same {@code docData} instance, populated from the line that was read
   * @throws NoMoreDataException if the input is exhausted and {@code forever} is false
   * @throws IOException if reading fails
   */
  @Override
  public DocData getNextDocData(DocData docData) throws NoMoreDataException, IOException {
    final String rawLine;
    final int docId;

    synchronized (this) {
      rawLine = reader.readLine();
      if (rawLine == null) {
        if (forever) {
          // Input exhausted: reopen the file and start over from its beginning.
          openFile();
          return getNextDocData(docData);
        }
        throw new NoMoreDataException();
      }
      if (docDataLineReader == null) {
        // First line ever seen: it is handed to the factory that picks the line parser.
        docDataLineReader = createDocDataLineReader(rawLine);
        if (skipHeaderLine) {
          // This line is not a document; move on to the next line without consuming an ID.
          return getNextDocData(docData);
        }
      }
      // Take the ID while still holding the lock, so the counter is bumped exactly once per line.
      docId = readCount++;
    }

    // The date String was written in the format of DateTools.dateToString.
    docData.clear();
    docData.setID(docId);
    docDataLineReader.parseLine(docData, rawLine);
    return docData;
  }
Пример #3
0
 /**
  * Same as {@link #makeDocument()}, only this method creates a document of the given size input by
  * <code>size</code>.
  *
  * <p>Text left over from a previous call (kept in {@code leftovr}) is consumed first. While the
  * available body is shorter than {@code size}, further documents are pulled from {@code source}
  * and their bodies appended. Whatever {@code createDocument} does not consume is remembered as
  * the new leftover.
  *
  * <p>A missing ({@code null}) body is treated as empty text when bodies are joined, so the
  * literal string {@code "null"} never ends up in a document body.
  *
  * @param size requested body size, passed through to {@code createDocument}
  * @return the created document
  * @throws Exception if the content source or document creation fails
  */
 public Document makeDocument(int size) throws Exception {
   LeftOver lvr = leftovr.get();
   if (lvr == null
       || lvr.docdata == null
       || lvr.docdata.getBody() == null
       || lvr.docdata.getBody().length() == 0) {
     resetLeftovers();
     // The leftover holds no usable text. Drop the local reference as well, so a leftover with a
     // null docdata is never dereferenced below and a fresh document is read from the source.
     // NOTE(review): assumes resetLeftovers() discards the stored leftover - confirm.
     lvr = null;
   }
   DocData docData = getDocState().docData;
   DocData dd = (lvr == null ? source.getNextDocData(docData) : lvr.docdata);
   int cnt = (lvr == null ? 0 : lvr.cnt);
   while (dd.getBody() == null || dd.getBody().length() < size) {
     // Not enough text yet: carry the current text over and prepend it to the next document.
     final String carried = dd.getBody();
     dd = source.getNextDocData(new DocData());
     cnt = 0;
     final String fresh = dd.getBody();
     // Plain concatenation would turn a null body into the text "null".
     dd.setBody((carried == null ? "" : carried) + (fresh == null ? "" : fresh));
   }
   Document doc = createDocument(dd, size, cnt);
   if (dd.getBody() == null || dd.getBody().length() == 0) {
     // createDocument consumed the whole body; nothing to keep for the next call.
     resetLeftovers();
   } else {
     if (lvr == null) {
       lvr = new LeftOver();
       leftovr.set(lvr);
     }
     lvr.docdata = dd;
     lvr.cnt = ++cnt;
   }
   return doc;
 }
 /**
  * Fills {@code docData} from the next tuple returned by {@code parser}.
  *
  * <p>The tuple is fetched before {@code docData} is cleared, so a failure in
  * {@code parser.next()} leaves {@code docData} untouched.
  *
  * @param docData the instance to clear and fill
  * @return the same {@code docData} instance with name, body, date and title set
  */
 @Override
 public synchronized DocData getNextDocData(DocData docData)
     throws NoMoreDataException, IOException {
   final String[] fields = parser.next();
   docData.clear();
   // ID, BODY, DATE and TITLE are the tuple indices of the respective fields.
   docData.setName(fields[ID]);
   docData.setBody(fields[BODY]);
   docData.setDate(fields[DATE]);
   docData.setTitle(fields[TITLE]);
   return docData;
 }
  /**
   * Reads the next input file and fills {@code docData} from its content.
   *
   * <p>Expected file layout: line 1 is the date, line 3 is the title (lines 2 and 4 are skipped),
   * and all remaining lines form the body, joined with single spaces. Only the selection of the
   * next file is done under this object's monitor; the file itself is read outside of it.
   *
   * <p>The reader is closed in a {@code finally} block, so the file handle is released even when
   * reading fails part-way through.
   *
   * @param docData the instance to clear and fill
   * @return the same {@code docData} instance, populated from the file
   * @throws NoMoreDataException if all files were consumed and {@code forever} is false
   * @throws IOException if the file cannot be opened or read
   */
  @Override
  public DocData getNextDocData(DocData docData) throws NoMoreDataException, IOException {
    final File f;
    final String name;
    synchronized (this) {
      if (!inputFiles.hasNext()) {
        // exhausted files, start a new round, unless forever set to false.
        if (!forever) {
          throw new NoMoreDataException();
        }
        inputFiles = new Iterator(dataDir);
        iteration++;
      }
      f = inputFiles.next();
      // The iteration number keeps names distinct when the same file is read in a later round.
      name = f.getCanonicalPath() + "_" + iteration;
    }

    String dateStr;
    String title;
    final StringBuilder bodyBuf = new StringBuilder(1024);
    final BufferedReader reader =
        new BufferedReader(new InputStreamReader(new FileInputStream(f), IOUtils.CHARSET_UTF_8));
    try {
      // First line is the date, 3rd is the title, rest is body
      dateStr = reader.readLine();
      reader.readLine(); // skip an empty line
      title = reader.readLine();
      reader.readLine(); // skip an empty line
      String line;
      while ((line = reader.readLine()) != null) {
        bodyBuf.append(line).append(' ');
      }
    } finally {
      // Always release the file handle, also when a read above throws.
      reader.close();
    }
    addBytes(f.length());

    // NOTE(review): dateStr is null for an empty file - confirm parseDate tolerates that.
    Date date = parseDate(dateStr);

    docData.clear();
    docData.setName(name);
    docData.setBody(bodyBuf.toString());
    docData.setTitle(title);
    docData.setDate(date);
    return docData;
  }
  /**
   * Averages {@code 1 / ln(position)} over every body hit position of the document.
   *
   * <p>A position of exactly 1 is scored as {@code 1 / ln(1.2)}, since {@code ln(1)} is zero.
   *
   * @param Doc the document whose body hit positions are read
   * @param query not used by this method
   * @return the mean reciprocal-log score, or 0 when the accumulated sum is zero (which includes
   *     the case of no hits at all)
   */
  public double getFirstPos(DocData Doc, QueryData query) {
    double reciprocalLogSum = 0;
    int hitCount = 0;

    for (ArrayList<Integer> positions : Doc.getbodyHitPos()) {
      for (int position : positions) {
        // Substitute 1.2 for position 1 to avoid dividing by ln(1) == 0.
        final double logArgument = (position == 1) ? 1.2 : position;
        reciprocalLogSum += 1 / Math.log(logArgument);
        hitCount++;
      }
    }

    return (reciprocalLogSum == 0) ? 0 : reciprocalLogSum / hitCount;
  }
Пример #7
0
 /**
  * Stores {@code text} in the {@code docData} field that column {@code position} maps to.
  *
  * <p>The mapping comes from {@code posToF}. Columns mapped to {@code PROP} are stored as a
  * property keyed by the column's {@code header} entry; the property table is created on first
  * use.
  *
  * @param docData the document data to update
  * @param position index of the column the text was read from
  * @param text the column value
  */
 private void setDocDataField(DocData docData, int position, String text) {
   switch (posToF[position]) {
     case NAME:
       docData.setName(text);
       return;
     case TITLE:
       docData.setTitle(text);
       return;
     case DATE:
       docData.setDate(text);
       return;
     case BODY:
       docData.setBody(text);
       return;
     case PROP: {
       Properties props = docData.getProps();
       if (props == null) {
         // No property table yet: create one and attach it before adding the entry.
         props = new Properties();
         docData.setProps(props);
       }
       props.setProperty(header[position], text);
       return;
     }
   }
 }
Пример #8
0
  /**
   * Creates a document from {@code docData}.
   *
   * <p>Only part of the body is used; {@code docData} is modified to keep the rest (or all of the
   * body is used if {@code size <= 0} or the body is not longer than {@code size}). The docdata
   * properties are reset so they are not added more than once.
   *
   * @param docData source of id, name, date, title, body and properties; its body and properties
   *     are modified by this call
   * @param size requested number of body characters; may be adjusted to a nearby whitespace
   * @param cnt when non-negative, appended to the document name as {@code "_" + cnt}
   * @return the populated document (the DocState's document when {@code reuseFields} is set,
   *     otherwise a new one)
   * @throws UnsupportedEncodingException declared for the {@code getBytes("UTF-8")} call
   */
  private Document createDocument(DocData docData, int size, int cnt)
      throws UnsupportedEncodingException {

    final DocState ds = getDocState();
    final Document doc = reuseFields ? ds.doc : new Document();
    // Start from an empty field list (matters when the Document instance is reused).
    doc.getFields().clear();

    // Set ID_FIELD
    Field idField = ds.getField(ID_FIELD, storeVal, Index.NOT_ANALYZED_NO_NORMS, termVecVal);
    int id;
    if (r != null) {
      // A random source is configured: pick an id below updateDocIDLimit.
      id = r.nextInt(updateDocIDLimit);
    } else {
      id = docData.getID();
      if (id == -1) {
        // docData carries no id of its own: fall back to the running creation counter.
        id = numDocsCreated.getAndIncrement();
      }
    }
    idField.setValue(Integer.toString(id));
    doc.add(idField);

    // Set NAME_FIELD
    String name = docData.getName();
    if (name == null) name = "";
    // A negative cnt means "no suffix"; otherwise the name becomes name_cnt.
    name = cnt < 0 ? name : name + "_" + cnt;
    Field nameField = ds.getField(NAME_FIELD, storeVal, indexVal, termVecVal);
    nameField.setValue(name);
    doc.add(nameField);

    // Set DATE_FIELD
    // Lazily create the DateUtil held by dateParsers.
    // NOTE(review): dateParsers looks like a per-thread holder - confirm.
    DateUtil util = dateParsers.get();
    if (util == null) {
      util = new DateUtil();
      dateParsers.set(util);
    }
    Date date = null;
    String dateString = docData.getDate();
    if (dateString != null) {
      // Rewind the shared parse position so parsing starts at the beginning of the string.
      util.pos.setIndex(0);
      date = util.parser.parse(dateString, util.pos);
      // System.out.println(dateString + " parsed to " + date);
    } else {
      dateString = "";
    }
    // The date is stored as its original string (empty when docData had none)...
    Field dateStringField = ds.getField(DATE_FIELD, storeVal, indexVal, termVecVal);
    dateStringField.setValue(dateString);
    doc.add(dateStringField);

    if (date == null) {
      // just set to right now
      date = new Date();
    }

    // ...and as milliseconds in a numeric field.
    NumericField dateField = ds.getNumericField(DATE_MSEC_FIELD);
    dateField.setLongValue(date.getTime());
    doc.add(dateField);

    // Time of day in seconds (hour * 3600 + minute * 60 + second), per util.cal.
    util.cal.setTime(date);
    final int sec =
        util.cal.get(Calendar.HOUR_OF_DAY) * 3600
            + util.cal.get(Calendar.MINUTE) * 60
            + util.cal.get(Calendar.SECOND);

    NumericField timeSecField = ds.getNumericField(TIME_SEC_FIELD);
    timeSecField.setIntValue(sec);
    doc.add(timeSecField);

    // Set TITLE_FIELD
    String title = docData.getTitle();
    Field titleField = ds.getField(TITLE_FIELD, storeVal, indexVal, termVecVal);
    titleField.setValue(title == null ? "" : title);
    doc.add(titleField);

    // Set BODY_FIELD (skipped entirely when the body is null or empty)
    String body = docData.getBody();
    if (body != null && body.length() > 0) {
      String bdy;
      if (size <= 0 || size >= body.length()) {
        bdy = body; // use all
        docData.setBody(""); // nothing left
      } else {
        // attempt not to break words - if whitespace found within next 20 chars...
        // The scan starts at index size - 1 and stops at the first whitespace, which becomes the
        // new cut point.
        // NOTE(review): with size == 1 and a body starting with whitespace, size becomes 0, so
        // the used part is empty and the whole body is left over - confirm this is acceptable.
        for (int n = size - 1; n < size + 20 && n < body.length(); n++) {
          if (Character.isWhitespace(body.charAt(n))) {
            size = n;
            break;
          }
        }
        bdy = body.substring(0, size); // use part
        docData.setBody(body.substring(size)); // some left
      }
      Field bodyField = ds.getField(BODY_FIELD, bodyStoreVal, bodyIndexVal, termVecVal);
      bodyField.setValue(bdy);
      doc.add(bodyField);

      if (storeBytes) {
        // Also store the used part of the body as UTF-8 bytes.
        Field bytesField =
            ds.getField(BYTES_FIELD, Store.YES, Index.NOT_ANALYZED_NO_NORMS, TermVector.NO);
        bytesField.setValue(bdy.getBytes("UTF-8"));
        doc.add(bytesField);
      }
    }

    if (indexProperties) {
      Properties props = docData.getProps();
      if (props != null) {
        // One field per property; keys and values are cast to String.
        for (final Map.Entry<Object, Object> entry : props.entrySet()) {
          Field f = ds.getField((String) entry.getKey(), storeVal, indexVal, termVecVal);
          f.setValue((String) entry.getValue());
          doc.add(f);
        }
        // Clear the properties so a later document built from the same docData does not add
        // them again.
        docData.setProps(null);
      }
    }

    // System.out.println("============== Created doc "+numDocsCreated+" :\n"+doc+"\n==========");
    return doc;
  }