Exemplo n.º 1
1
  /**
   * Add a file to the Lucene index (and generate a xref file)
   *
   * @param file The file to add
   * @param path The path to the file (from source root)
   * @throws java.io.IOException if an error occurs
   */
  private void addFile(File file, String path) throws IOException {
    try (InputStream in = new BufferedInputStream(new FileInputStream(file))) {
      FileAnalyzer fa = AnalyzerGuru.getAnalyzer(in, path);
      for (IndexChangedListener listener : listeners) {
        listener.fileAdd(path, fa.getClass().getSimpleName());
      }
      fa.setCtags(ctags);
      fa.setProject(Project.getProject(path));

      Document d;
      try {
        d = analyzerGuru.getDocument(file, in, path, fa);
      } catch (Exception e) {
        log.log(
            Level.INFO,
            "Skipped file ''{0}'' because the analyzer didn''t " + "understand it.",
            path);
        StringBuilder stack = new StringBuilder();
        for (StackTraceElement ste : e.getStackTrace()) {
          stack.append(ste.toString()).append(System.lineSeparator());
        }
        StringBuilder sstack = new StringBuilder();
        for (Throwable t : e.getSuppressed()) {
          for (StackTraceElement ste : t.getStackTrace()) {
            sstack.append(ste.toString()).append(System.lineSeparator());
          }
        }
        log.log(
            Level.FINE,
            "Exception from analyzer {0}: {1} {2}{3}{4}{5}{6}",
            new String[] {
              fa.getClass().getName(),
              e.toString(),
              System.lineSeparator(),
              stack.toString(),
              System.lineSeparator(),
              sstack.toString()
            });
        return;
      }

      writer.addDocument(d, fa);
      Genre g = fa.getFactory().getGenre();
      if (xrefDir != null && (g == Genre.PLAIN || g == Genre.XREFABLE)) {
        File xrefFile = new File(xrefDir, path);
        // If mkdirs() returns false, the failure is most likely
        // because the file already exists. But to check for the
        // file first and only add it if it doesn't exists would
        // only increase the file IO...
        if (!xrefFile.getParentFile().mkdirs()) {
          assert xrefFile.getParentFile().exists();
        }
        fa.writeXref(xrefDir, path);
      }
      setDirty();
      for (IndexChangedListener listener : listeners) {
        listener.fileAdded(path, fa.getClass().getSimpleName());
      }
    }
  }
  public synchronized ShapeFieldCache<T> getCache(LeafReader reader) throws IOException {
    ShapeFieldCache<T> idx = sidx.get(reader);
    if (idx != null) {
      return idx;
    }
    long startTime = System.currentTimeMillis();

    log.fine("Building Cache [" + reader.maxDoc() + "]");
    idx = new ShapeFieldCache<>(reader.maxDoc(), defaultSize);
    int count = 0;
    DocsEnum docs = null;
    Terms terms = reader.terms(shapeField);
    TermsEnum te = null;
    if (terms != null) {
      te = terms.iterator(te);
      BytesRef term = te.next();
      while (term != null) {
        T shape = readShape(term);
        if (shape != null) {
          docs = te.docs(null, docs, DocsEnum.FLAG_NONE);
          Integer docid = docs.nextDoc();
          while (docid != DocIdSetIterator.NO_MORE_DOCS) {
            idx.add(docid, shape);
            docid = docs.nextDoc();
            count++;
          }
        }
        term = te.next();
      }
    }
    sidx.put(reader, idx);
    long elapsed = System.currentTimeMillis() - startTime;
    log.fine("Cached: [" + count + " in " + elapsed + "ms] " + idx);
    return idx;
  }
 private void initDocument() {
   try {
     _document = DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument();
   } catch (ParserConfigurationException e) {
     e.printStackTrace();
     System.exit(1);
   }
   _rootNode = _document.createElement("body");
   _document.appendChild(_rootNode);
 }
 private String makeStringFromXML() {
   Transformer transformer = null;
   try {
     transformer = TransformerFactory.newInstance().newTransformer();
   } catch (TransformerConfigurationException e) {
     e.printStackTrace();
     System.exit(1);
   }
   transformer.setOutputProperty(OutputKeys.INDENT, "no");
   DOMSource source = new DOMSource(_document);
   StringWriter writer = new StringWriter();
   StreamResult result = new StreamResult(writer);
   try {
     transformer.transform(source, result);
   } catch (TransformerException e) {
     e.printStackTrace();
     System.exit(1);
   }
   return writer.toString();
 }
 public void norms(String f, byte[] norms, int offset) throws IOException {
   byte[] oldNorms = super.norms(f);
   int oldDoc = 0;
   // while (oldDoc < oldNorms.length) { TODO MC
   while (oldDoc < oldToNew.length) {
     int newDoc = oldToNew[oldDoc];
     // if (newDoc != -1) { TODO MC
     // norms[newDoc] = oldNorms[oldDoc]; TODO MC
     System.arraycopy(
         oldNorms, oldDoc * 4, norms, newDoc * 4, 4); // TODO MC copy lengths instead of norms
     // } TODO MC
     oldDoc++;
   }
 }
Exemplo n.º 6
0
  public static void main(String[] args) throws Exception {
    IndexReader reader = null;
    FSDirectory dir = null;
    String field = null;
    boolean IncludeTermFreqs = false;

    if (args.length == 0 || args.length > 4) {
      usage();
      System.exit(1);
    }

    if (args.length > 0) {
      dir = FSDirectory.open(new File(args[0]));
    }

    for (int i = 1; i < args.length; i++) {
      if (args[i].equals("-t")) {
        IncludeTermFreqs = true;
      } else {
        try {
          numTerms = Integer.parseInt(args[i]);
        } catch (NumberFormatException e) {
          field = args[i];
        }
      }
    }
    String[] fields = field != null ? new String[] {field} : null;
    reader = DirectoryReader.open(dir);
    TermStats[] terms = getHighFreqTerms(reader, numTerms, fields);
    if (!IncludeTermFreqs) {
      // default HighFreqTerms behavior
      for (int i = 0; i < terms.length; i++) {
        System.out.printf(
            "%s:%s %,d \n", terms[i].field, terms[i].termtext.utf8ToString(), terms[i].docFreq);
      }
    } else {
      TermStats[] termsWithTF = sortByTotalTermFreq(reader, terms);
      for (int i = 0; i < termsWithTF.length; i++) {
        System.out.printf(
            "%s:%s \t totalTF = %,d \t doc freq = %,d \n",
            termsWithTF[i].field,
            termsWithTF[i].termtext.utf8ToString(),
            termsWithTF[i].totalTermFreq,
            termsWithTF[i].docFreq);
      }
    }
    reader.close();
  }
    public void seek(TermEnum terms) throws IOException {
      original.seek(terms);

      docFreq = terms.docFreq();
      pointer = -1;

      if (docFreq > postingMaps.length) { // grow postingsMap
        PostingMap[] newMap = new PostingMap[docFreq];
        System.arraycopy(postingMaps, 0, newMap, 0, postingMaps.length);
        for (int i = postingMaps.length; i < docFreq; i++) {
          newMap[i] = new PostingMap();
        }
        postingMaps = newMap;
      }

      out.reset();

      int i = 0;
      while (original.next()) {
        PostingMap map = postingMaps[i++];
        map.newDoc = oldToNew[original.doc()]; // remap the newDoc id
        map.offset = out.getFilePointer(); // save pointer to buffer

        final int tf = original.freq(); // buffer tf & positions
        out.writeVInt(tf);
        int prevPosition = 0;
        for (int j = tf; j > 0; j--) { // delta encode positions
          int p = original.nextPosition();
          out.writeVInt(p - prevPosition);
          prevPosition = p;
        }
      }
      out.flush();
      docFreq = i; // allow for deletions

      Arrays.sort(postingMaps, 0, docFreq); // resort by mapped doc ids
      // HeapSorter.sort(postingMaps,docFreq); // TODO MC - due to the lack of space

      // NOTE: this might be substantially faster if RAMInputStream were public
      // and supported a reset() operation.
      in = tempDir.openInput(TEMP_FILE);
    }
Exemplo n.º 8
0
/** Created by muyux on 2016/1/11. */
public class Main {
  static String FILE_SEPARATOR = System.getProperty("file.separator");

  private static String[] getStopWords(String path) {
    try {
      List<String> list = FileUtils.readLines(new File(path), "utf-8");
      System.out.println("==============================================================");
      System.out.println("getStopWords " + list.get(159));
      System.out.println("==============================================================");
      return list.toArray(new String[list.size()]);
    } catch (FileNotFoundException e) {
      e.printStackTrace();
    } catch (IOException e) {
      e.printStackTrace();
    } finally {
    }
    return null;
  }

  private static void index_h(String prefix, File file, IndexWriter indexWriter)
      throws IOException {
    Document doc = null;

    if (file.isDirectory()) {
      File files[] = file.listFiles();
      for (File file1 : files) {
        index_h(prefix + FILE_SEPARATOR + file.getName(), file1, indexWriter);
      }
    } else {
      String content = FileUtils.readFileToString(file, "utf-8");

      System.out.println("==============================================================");
      System.out.println("index_h " + content);
      System.out.println("==============================================================");

      String filename = prefix + FILE_SEPARATOR + file.getName();
      String path = file.getAbsolutePath();

      doc = new Document();
      doc.add(new Field("content", content, Field.Store.YES, Field.Index.ANALYZED));
      doc.add(new Field("relative_path", filename, Field.Store.YES, Field.Index.NOT_ANALYZED));
      indexWriter.addDocument(doc);
    }
  }

  private static Directory index(Analyzer analyzer, String processingPath) {
    RAMDirectory directory = null;
    IndexWriter indexWriter = null;
    try {
      directory = new RAMDirectory();
      IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_35, analyzer);
      indexWriter = new IndexWriter(directory, iwc);
      File file = new File(processingPath);
      index_h("", file, indexWriter);
    } catch (IOException e) {
      e.printStackTrace();
    } finally {
      if (indexWriter != null) {
        try {
          indexWriter.close();
        } catch (CorruptIndexException e1) {
          // TODO Auto-generated catch block
          e1.printStackTrace();
        } catch (IOException e1) {
          // TODO Auto-generated catch block
          e1.printStackTrace();
        }
      }
    }

    return directory;
  }

  private static Map<String, List<String>> generate_result(Directory directory) {
    Map<String, List<String>> result_map = new HashMap<String, List<String>>();

    try {
      IndexReader reader = IndexReader.open(directory);
      TermEnum termEnum = reader.terms();
      while (termEnum.next()) {
        String termEnumString = termEnum.term().toString();
        if (termEnumString.startsWith("content:")) {
          String term = termEnumString.substring(termEnumString.lastIndexOf(":") + 1);
          TermDocs termDocs = reader.termDocs(termEnum.term());
          while (termDocs.next()) {
            Document doc = reader.document(termDocs.doc());
            String relative_path = doc.get("relative_path");

            if (result_map.containsKey(relative_path)) {
              result_map.get(relative_path).add(term + termDocs.freq());
            } else {
              result_map.put(relative_path, new ArrayList<String>());
            }
          }
        }
      }
    } catch (IOException e) {
      e.printStackTrace();
    } finally {
    }

    return result_map;
  }

  public static void main(String[] args) {

    String formatstr = "ws [--in][--out][--dd][--sw][-h]";
    Options opt = new Options();
    // opt.addOption(OptionBuilder.withArgName("in").hasArg().withDescription("search for buildfile
    // towards the root of the filesystem and use it").create("O"));
    opt.addOption(
        OptionBuilder.withLongOpt("in")
            .withDescription("file path of those files need to be processed")
            .withValueSeparator('=')
            .hasArg()
            .create());
    opt.addOption(
        OptionBuilder.withLongOpt("out")
            .withDescription("file path to store result")
            .withValueSeparator('=')
            .hasArg()
            .create());
    opt.addOption(
        OptionBuilder.withLongOpt("dd")
            .withDescription("file path of dictionary")
            .withValueSeparator('=')
            .hasArg()
            .create());
    opt.addOption(
        OptionBuilder.withLongOpt("sw")
            .withDescription("file path of stop words")
            .withValueSeparator('=')
            .hasArg()
            .create());
    opt.addOption("h", "help", false, "print help for the command.");

    if (args.length == 0) {
      HelpFormatter hf = new HelpFormatter();
      hf.printHelp(formatstr, "", opt, "");
      return;
    } else {
      parse_args(args, formatstr, opt);
    }
  }

  private static void parse_args(String[] args, String formatstr, Options opt) {
    HelpFormatter formatter = new HelpFormatter();
    CommandLineParser parser = new PosixParser();
    CommandLine cl = null;

    try {
      cl = parser.parse(opt, args);
    } catch (ParseException e) {
      formatter.printHelp(formatstr, opt); // 如果发生异常,则打印出帮助信息
    }

    if (cl.hasOption("in") && cl.hasOption("out") && cl.hasOption("dd") && cl.hasOption("sw")) {
      String stopWordsPath = cl.getOptionValue("sw");
      String inPath = cl.getOptionValue("in");
      String outPath = cl.getOptionValue("out");
      String dicPath = cl.getOptionValue("dd");
      processOperation(stopWordsPath, inPath, outPath, dicPath);
    } else {
      HelpFormatter hf = new HelpFormatter();
      hf.printHelp(formatstr, "", opt, "");
      return;
    }
  }

  private static void processOperation(
      String stopWordsPath, String inPath, String outPath, String dicPath) {
    Analyzer analyzer = new MyMMSegAnalyzer(new File(dicPath), getStopWords(stopWordsPath));
    Directory directory = index(analyzer, inPath);
    Map<String, List<String>> result_map = generate_result(directory);
    output_result(outPath, result_map);
  }

  private static void output_result(String outPath, Map<String, List<String>> result_map) {
    for (String s : result_map.keySet()) {
      try {
        FileUtils.writeLines(new File(outPath + s), result_map.get(s));
      } catch (IOException e) {
        e.printStackTrace();
      }
    }
  }
}
Exemplo n.º 9
0
  /**
   * Update the content of this index database
   *
   * @throws IOException if an error occurs
   * @throws HistoryException if an error occurs when accessing the history
   */
  public void update() throws IOException, HistoryException {
    synchronized (lock) {
      if (running) {
        throw new IOException("Indexer already running!");
      }
      running = true;
      interrupted = false;
    }

    String ctgs = RuntimeEnvironment.getInstance().getCtags();
    if (ctgs != null) {
      ctags = new Ctags();
      ctags.setBinary(ctgs);
    }
    if (ctags == null) {
      log.severe("Unable to run ctags! searching definitions will not work!");
    }

    if (ctags != null) {
      String filename = RuntimeEnvironment.getInstance().getCTagsExtraOptionsFile();
      if (filename != null) {
        ctags.setCTagsExtraOptionsFile(filename);
      }
    }

    try {
      Analyzer analyzer = AnalyzerGuru.getAnalyzer();
      IndexWriterConfig iwc = new IndexWriterConfig(SearchEngine.LUCENE_VERSION, analyzer);
      iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
      // iwc.setRAMBufferSizeMB(256.0);  //TODO check what is the sweet spot
      writer = new IndexWriter(indexDirectory, iwc);
      writer.commit(); // to make sure index exists on the disk
      // writer.setMaxFieldLength(RuntimeEnvironment.getInstance().getIndexWordLimit());

      if (directories.isEmpty()) {
        if (project == null) {
          directories.add("");
        } else {
          directories.add(project.getPath());
        }
      }

      for (String dir : directories) {
        File sourceRoot;
        if ("".equals(dir)) {
          sourceRoot = RuntimeEnvironment.getInstance().getSourceRootFile();
        } else {
          sourceRoot = new File(RuntimeEnvironment.getInstance().getSourceRootFile(), dir);
        }

        HistoryGuru.getInstance().ensureHistoryCacheExists(sourceRoot);

        String startuid = Util.path2uid(dir, "");
        IndexReader reader = DirectoryReader.open(indexDirectory); // open existing index
        Terms terms = null;
        int numDocs = reader.numDocs();
        if (numDocs > 0) {
          Fields uFields = MultiFields.getFields(reader); // reader.getTermVectors(0);
          terms = uFields.terms(QueryBuilder.U);
        }

        try {
          if (numDocs > 0) {
            uidIter = terms.iterator(null);
            TermsEnum.SeekStatus stat = uidIter.seekCeil(new BytesRef(startuid), true); // init uid
            if (stat == TermsEnum.SeekStatus.END || stat == TermsEnum.SeekStatus.NOT_FOUND) {
              uidIter = null;
            }
          }
          // TODO below should be optional, since it traverses the tree once more to get total
          // count! :(
          int file_cnt = 0;
          if (RuntimeEnvironment.getInstance().isPrintProgress()) {
            log.log(Level.INFO, "Counting files in {0} ...", dir);
            file_cnt = indexDown(sourceRoot, dir, true, 0, 0);
            if (log.isLoggable(Level.INFO)) {
              log.log(
                  Level.INFO, "Need to process: {0} files for {1}", new Object[] {file_cnt, dir});
            }
          }

          indexDown(sourceRoot, dir, false, 0, file_cnt);

          while (uidIter != null
              && uidIter.term() != null
              && uidIter.term().utf8ToString().startsWith(startuid)) {
            removeFile();
            uidIter.next();
          }
        } finally {
          reader.close();
        }
      }
    } finally {
      if (writer != null) {
        try {
          writer.prepareCommit();
          writer.commit();
          writer.close();
        } catch (IOException e) {
          log.log(Level.WARNING, "An error occured while closing writer", e);
        }
      }

      if (ctags != null) {
        try {
          ctags.close();
        } catch (IOException e) {
          log.log(Level.WARNING, "An error occured while closing ctags process", e);
        }
      }

      synchronized (lock) {
        running = false;
      }
    }

    if (!isInterrupted() && isDirty()) {
      if (RuntimeEnvironment.getInstance().isOptimizeDatabase()) {
        optimize();
      }
      createSpellingSuggestions();
      RuntimeEnvironment env = RuntimeEnvironment.getInstance();
      File timestamp = new File(env.getDataRootFile(), "timestamp");
      if (timestamp.exists()) {
        if (!timestamp.setLastModified(System.currentTimeMillis())) {
          log.log(
              Level.WARNING,
              "Failed to set last modified time on ''{0}'', used for timestamping the index database.",
              timestamp.getAbsolutePath());
        }
      } else {
        if (!timestamp.createNewFile()) {
          log.log(
              Level.WARNING,
              "Failed to create file ''{0}'', used for timestamping the index database.",
              timestamp.getAbsolutePath());
        }
      }
    }
  }
Exemplo n.º 10
0
  public static void main(String[] args) throws IOException {

    IndexReader reader = null;

    /*
     *  Opening the index first simplifies the processing of the
     *  rest of the command line arguments.
     */
    for (int i = 0; i < args.length; i++) {
      if (("-index".equals(args[i])) && ((i + 1) < args.length)) {
        reader = DirectoryReader.open(FSDirectory.open(new File(args[i + 1])));

        if (reader == null) {
          System.err.println("Error:  Can't open index " + args[i + 1]);
          System.exit(1);
        }
        ;

        break;
      }
      ;
    }
    ;

    if (reader == null) {
      System.err.println(usage);
      System.exit(1);
    }
    ;

    /*
     *  Process the command line arguments sequentially.
     */
    for (int i = 0; i < args.length; i++) {

      if ("-index".equals(args[i])) {

        /*
         *  Handled in the previous loop, so just skip the argument.
         */
        i++;

      } else if ("-list-edocid".equals(args[i])) {

        System.out.println("-list-edocid:");

        if ((i + 1) >= args.length) {
          System.out.println(usage);
          break;
        }
        ;

        Document d = reader.document(Integer.parseInt(args[i + 1]));

        System.out.println(
            "Internal docid --> External docid: " + args[i + 1] + " --> " + d.get("externalId"));

        i += 1;
      } else if ("-list-docids".equals(args[i])) {

        System.out.println("-list-docids:");

        for (int j = 0; j < reader.numDocs(); j++) {
          Document d = reader.document(j);
          System.out.println("Internal --> external docid: " + j + " --> " + d.get("externalId"));
        }
        ;

      } else if ("-list-fields".equals(args[i])) {

        Fields fields = MultiFields.getFields(reader);

        System.out.print("\nNumber of fields:  ");

        if (fields == null) System.out.println("0");
        else {
          System.out.println(fields.size());

          Iterator<String> is = fields.iterator();

          while (is.hasNext()) {
            System.out.println("\t" + is.next());
          }
          ;
        }
        ;

      } else if ("-list-postings".equals(args[i])) {

        if ((i + 2) >= args.length) {
          System.out.println(usage);
          break;
        }
        ;

        listPostings(reader, args[i + 1], args[i + 2], Integer.MAX_VALUE);
        i += 2;

      } else if ("-list-postings-sample".equals(args[i])) {

        if ((i + 2) >= args.length) {
          System.out.println(usage);
          break;
        }
        ;

        listPostings(reader, args[i + 1], args[i + 2], 5);
        i += 2;

      } else if ("-list-stats".equals(args[i])) {

        System.out.println("Corpus statistics:");
        System.out.println("\tnumdocs\t\t" + reader.numDocs());
        System.out.println(
            "\turl:\t"
                + "\tnumdocs="
                + reader.getDocCount("url")
                + "\tsumTotalTF="
                + reader.getSumTotalTermFreq("url")
                + "\tavglen="
                + reader.getSumTotalTermFreq("url") / (float) reader.getDocCount("url"));

        System.out.println(
            "\tkeywords:"
                + "\tnumdocs="
                + reader.getDocCount("keywords")
                + "\tsumTotalTF="
                + reader.getSumTotalTermFreq("keywords")
                + "\tavglen="
                + reader.getSumTotalTermFreq("keywords") / (float) reader.getDocCount("keywords"));

        System.out.println(
            "\ttitle:\t"
                + "\tnumdocs="
                + reader.getDocCount("title")
                + "\tsumTotalTF="
                + reader.getSumTotalTermFreq("title")
                + "\tavglen="
                + reader.getSumTotalTermFreq("title") / (float) reader.getDocCount("title"));

        System.out.println(
            "\tbody:\t"
                + "\tnumdocs="
                + reader.getDocCount("body")
                + "\tsumTotalTF="
                + reader.getSumTotalTermFreq("body")
                + "\tavglen="
                + reader.getSumTotalTermFreq("body") / (float) reader.getDocCount("body"));

        System.out.println(
            "\tinlink:\t"
                + "\tnumdocs="
                + reader.getDocCount("inlink")
                + "\tsumTotalTF="
                + reader.getSumTotalTermFreq("inlink")
                + "\tavglen="
                + reader.getSumTotalTermFreq("inlink") / (float) reader.getDocCount("inlink"));

      } else if ("-list-terms".equals(args[i])) {

        if ((i + 1) >= args.length) {
          System.out.println(usage);
          break;
        }
        ;

        listTermDictionary(reader, args[i + 1]);
        i += 1;

      } else if ("-list-termvector".equals(args[i])) {

        if ((i + 1) >= args.length) {
          System.out.println(usage);
          break;
        }
        ;

        listTermVectors(reader, args[i + 1]);
        i += 1;

      } else if ("-list-termvector-field".equals(args[i])) {

        if ((i + 2) >= args.length) {
          System.out.println(usage);
          break;
        }
        ;

        listTermVectorField(reader, args[i + 1], args[i + 2]);
        i += 2;

      } else System.err.println("\nWarning:  Unknown argument " + args[i] + " ignored.");
    }
    ;

    /*
     *  Close the index and exit gracefully.
     */
    reader.close();
  }
Exemplo n.º 11
0
/** A simple utility for inspecting Lucene indexes. Run it to see a simple usage message. */
public class InspectIndex {

  static String usage =
      "Usage:  java "
          + System.getProperty("sun.java.command")
          + " -index INDEX_PATH\n\n"
          + "where options include\n"
          + "    -list-docids\tlist the external docids of each document\n"
          + "    -list-edocid IDOCID\n"
          + "\t\t\tlist the external docid of the document\n"
          + "\t\t\twith internal docid of IDOCID\n"
          + "    -list-fields\tlist the fields in the index\n"
          + "    -list-postings TERM FIELD\n"
          + "\t\t\tdisplay the posting list entries for\n"
          + "\t\t\tterm TERM in field FIELD\n"
          + "    -list-postings-sample TERM FIELD\n"
          + "\t\t\tdisplay the first few posting list entries for\n"
          + "\t\t\tterm TERM in field FIELD\n"
          + "    -list-stats\n"
          + "\t\t\tdisplay corpus statistics\n"
          + "    -list-terms FIELD"
          + "\tdisplay the term dictionary for field FIELD\n"
          + "    -list-termvector DOCID\n"
          + "\t\t\tdisplay the term vectors for all fields in the document\n"
          + "\t\t\twith internal DOCID\n"
          + "    -list-termvector-field DOCID FIELD\n"
          + "\t\t\tdisplay the term vector for FIELD in the document\n"
          + "\t\t\twith internal DOCID\n";

  public static void main(String[] args) throws IOException {

    IndexReader reader = null;

    /*
     *  Opening the index first simplifies the processing of the
     *  rest of the command line arguments.
     */
    for (int i = 0; i < args.length; i++) {
      if (("-index".equals(args[i])) && ((i + 1) < args.length)) {
        reader = DirectoryReader.open(FSDirectory.open(new File(args[i + 1])));

        if (reader == null) {
          System.err.println("Error:  Can't open index " + args[i + 1]);
          System.exit(1);
        }
        ;

        break;
      }
      ;
    }
    ;

    if (reader == null) {
      System.err.println(usage);
      System.exit(1);
    }
    ;

    /*
     *  Process the command line arguments sequentially.
     */
    for (int i = 0; i < args.length; i++) {

      if ("-index".equals(args[i])) {

        /*
         *  Handled in the previous loop, so just skip the argument.
         */
        i++;

      } else if ("-list-edocid".equals(args[i])) {

        System.out.println("-list-edocid:");

        if ((i + 1) >= args.length) {
          System.out.println(usage);
          break;
        }
        ;

        Document d = reader.document(Integer.parseInt(args[i + 1]));

        System.out.println(
            "Internal docid --> External docid: " + args[i + 1] + " --> " + d.get("externalId"));

        i += 1;
      } else if ("-list-docids".equals(args[i])) {

        System.out.println("-list-docids:");

        for (int j = 0; j < reader.numDocs(); j++) {
          Document d = reader.document(j);
          System.out.println("Internal --> external docid: " + j + " --> " + d.get("externalId"));
        }
        ;

      } else if ("-list-fields".equals(args[i])) {

        Fields fields = MultiFields.getFields(reader);

        System.out.print("\nNumber of fields:  ");

        if (fields == null) System.out.println("0");
        else {
          System.out.println(fields.size());

          Iterator<String> is = fields.iterator();

          while (is.hasNext()) {
            System.out.println("\t" + is.next());
          }
          ;
        }
        ;

      } else if ("-list-postings".equals(args[i])) {

        if ((i + 2) >= args.length) {
          System.out.println(usage);
          break;
        }
        ;

        listPostings(reader, args[i + 1], args[i + 2], Integer.MAX_VALUE);
        i += 2;

      } else if ("-list-postings-sample".equals(args[i])) {

        if ((i + 2) >= args.length) {
          System.out.println(usage);
          break;
        }
        ;

        listPostings(reader, args[i + 1], args[i + 2], 5);
        i += 2;

      } else if ("-list-stats".equals(args[i])) {

        System.out.println("Corpus statistics:");
        System.out.println("\tnumdocs\t\t" + reader.numDocs());
        System.out.println(
            "\turl:\t"
                + "\tnumdocs="
                + reader.getDocCount("url")
                + "\tsumTotalTF="
                + reader.getSumTotalTermFreq("url")
                + "\tavglen="
                + reader.getSumTotalTermFreq("url") / (float) reader.getDocCount("url"));

        System.out.println(
            "\tkeywords:"
                + "\tnumdocs="
                + reader.getDocCount("keywords")
                + "\tsumTotalTF="
                + reader.getSumTotalTermFreq("keywords")
                + "\tavglen="
                + reader.getSumTotalTermFreq("keywords") / (float) reader.getDocCount("keywords"));

        System.out.println(
            "\ttitle:\t"
                + "\tnumdocs="
                + reader.getDocCount("title")
                + "\tsumTotalTF="
                + reader.getSumTotalTermFreq("title")
                + "\tavglen="
                + reader.getSumTotalTermFreq("title") / (float) reader.getDocCount("title"));

        System.out.println(
            "\tbody:\t"
                + "\tnumdocs="
                + reader.getDocCount("body")
                + "\tsumTotalTF="
                + reader.getSumTotalTermFreq("body")
                + "\tavglen="
                + reader.getSumTotalTermFreq("body") / (float) reader.getDocCount("body"));

        System.out.println(
            "\tinlink:\t"
                + "\tnumdocs="
                + reader.getDocCount("inlink")
                + "\tsumTotalTF="
                + reader.getSumTotalTermFreq("inlink")
                + "\tavglen="
                + reader.getSumTotalTermFreq("inlink") / (float) reader.getDocCount("inlink"));

      } else if ("-list-terms".equals(args[i])) {

        if ((i + 1) >= args.length) {
          System.out.println(usage);
          break;
        }
        ;

        listTermDictionary(reader, args[i + 1]);
        i += 1;

      } else if ("-list-termvector".equals(args[i])) {

        if ((i + 1) >= args.length) {
          System.out.println(usage);
          break;
        }
        ;

        listTermVectors(reader, args[i + 1]);
        i += 1;

      } else if ("-list-termvector-field".equals(args[i])) {

        if ((i + 2) >= args.length) {
          System.out.println(usage);
          break;
        }
        ;

        listTermVectorField(reader, args[i + 1], args[i + 2]);
        i += 2;

      } else System.err.println("\nWarning:  Unknown argument " + args[i] + " ignored.");
    }
    ;

    /*
     *  Close the index and exit gracefully.
     */
    reader.close();
  }

  /*
   *  listPostings displays the first n postings for a term in a
   *  field in an index (specified by reader).  Set n to MAX_VALUE
   *  to display all postings.
   */
  static void listPostings(IndexReader reader, String termString, String field, Integer n)
      throws IOException {

    System.out.println("\nPostings:  " + termString + " " + field);

    /*
     *  Prepare to access the index.
     */
    BytesRef termBytes = new BytesRef(termString);
    Term term = new Term(field, termBytes);
    Bits liveDocs = MultiFields.getLiveDocs(reader);

    /*
     *  Lookup the collection term frequency (ctf).
     */
    long df = reader.docFreq(term);
    System.out.println("\tdf:  " + df);

    long ctf = reader.totalTermFreq(term);
    System.out.println("\tctf:  " + ctf);

    if (df < 1) return;

    /*
     *  Lookup the inverted list.
     */
    DocsAndPositionsEnum postings =
        MultiFields.getTermPositionsEnum(reader, liveDocs, field, termBytes);

    /*
     *  Iterate through the first n postings.
     */
    long count = 0;

    while ((count < n) && (postings.nextDoc() != DocIdSetIterator.NO_MORE_DOCS)) {

      System.out.println("\tdocid: " + postings.docID());
      int tf = postings.freq();
      System.out.println("\ttf: " + tf);
      System.out.print("\tPositions: ");

      for (int j = 0; j < tf; j++) {
        int pos = postings.nextPosition();
        System.out.print(pos + " ");
      }

      System.out.println("");

      count++;
    }
    ;

    return;
  }

  /*
   *  listTermDictionary displays the term dictionary for a field.
   */
  static void listTermDictionary(IndexReader reader, String fieldName) throws IOException {

    System.out.println("\nTerm Dictionary:  field " + fieldName);

    /*
      Grant says:
      MultiFields.getTerms(IndexReader, fieldName)
    */

    Terms terms = MultiFields.getTerms(reader, fieldName);

    if ((terms == null) || (terms.size() == -1))
      System.out.println("    The term dictionary is empty.");
    else {
      System.out.println("    Vocabulary size: " + terms.size() + " terms");

      TermsEnum ithTerm = terms.iterator(null);

      /*
       *  Iterate over the terms in this document.
       *  Information about a term's occurrences (tf and
       *  positions) is accessed via the indexing API, which
       *  returns inverted lists that describe (only) the
       *  current document.
       */
      while (ithTerm.next() != null) {
        System.out.format(
            "      %-30s %d %d\n",
            ithTerm.term().utf8ToString(), ithTerm.docFreq(), ithTerm.totalTermFreq());
      }
      ;
    }
    ;
  }

  /*
   *  listTermVectors displays the term vectors for all of the fields
   *  in a document in an index (specified by reader).
   */
  static void listTermVectors(IndexReader reader, String docidString) throws IOException {

    System.out.println("\nTermVector:  docid " + docidString);

    int docid = Integer.parseInt(docidString);

    if ((docid < 0) || (docid >= reader.numDocs())) {
      System.out.println("ERROR:  " + docidString + " is a bad document id.");
      return;
    }
    ;

    /*
     *  Iterate over the fields in this document.
     */
    Fields fields = reader.getTermVectors(docid);
    Iterator<String> fieldIterator = fields.iterator();

    while (fieldIterator.hasNext()) {
      String fieldName = fieldIterator.next();
      System.out.println("  Field: " + fieldName);

      Terms terms = fields.terms(fieldName);
      termVectorDisplay(terms);
    }
    ;
  }

  /*
   *  listTermVectorField displays the term vector for a field in
   *  a document in an index (specified by reader).
   */
  static void listTermVectorField(IndexReader reader, String docidString, String field)
      throws IOException {

    System.out.println("\nTermVector:  docid " + docidString + ", field " + field);

    int docid = Integer.parseInt(docidString);

    if ((docid < 0) || (docid >= reader.numDocs())) {
      System.out.println("ERROR:  " + docidString + " is a bad document id.");
      return;
    }
    ;

    Terms terms = reader.getTermVector(docid, field);
    termVectorDisplay(terms);
  }

  /*
   *  Utility function to display a term vector.
   */
  static void termVectorDisplay(Terms terms) throws IOException {

    if ((terms == null) || (terms.size() == -1)) System.out.println("    The field is not stored.");
    else {
      /*
       *  The terms for this field are stored.
       */
      System.out.println("    Vocabulary size: " + terms.size() + " terms");

      TermsEnum ithTerm = terms.iterator(null);

      /*
       *  Iterate over the terms in this document.
       *  Information about a term's occurrences (tf and
       *  positions) is accessed via the indexing API, which
       *  returns inverted lists that describe (only) the
       *  current document.
       */
      while (ithTerm.next() != null) {
        System.out.format(
            "      %10d %-20s %d ",
            ithTerm.ord(), ithTerm.term().utf8ToString(), ithTerm.totalTermFreq());

        DocsAndPositionsEnum currDoc = ithTerm.docsAndPositions(null, null);
        currDoc.nextDoc();

        for (int jthPosition = 0; jthPosition < ithTerm.totalTermFreq(); jthPosition++)
          System.out.print(currDoc.nextPosition() + " ");

        System.out.println();
      }
      ;
    }
    ;
  }
}
 public static void main(String[] args) throws Exception {
   int res = new IndexSorterArquivoWeb().doMain(NutchConfiguration.create(), args);
   System.exit(res);
 }
Exemplo n.º 13
0
  public static void main(String[] args) throws Exception {
    if (args.length < 2) {
      System.err.println("TermDumper [-c|-v value] field <index...>");
      System.exit(1);
    }

    boolean count = false;
    String value = null;
    boolean all = false;

    int i = 0;
    for (; i < args.length; i++) {
      String arg = args[i];

      if ("-h".equals(arg) || "--help".equals(arg)) {
        System.err.println("TermDumper [-c|-v value] field <index...>");
        System.exit(1);
      } else if ("-c".equals(arg) || "--count".equals(arg)) {
        count = true;
      } else if ("-v".equals(arg) || "--vaue".equals(arg)) {
        value = args[++i];
      } else if ("-a".equals(arg) || "--all".equals(arg)) {
        all = true;
      } else {
        break;
      }
    }

    String field = args[i++];

    java.util.ArrayList<IndexReader> readers =
        new java.util.ArrayList<IndexReader>(args.length - 1);
    for (; i < args.length; i++) {
      String arg = args[i];
      try {
        IndexReader reader = IndexReader.open(new MMapDirectory(new File(arg)), true);

        readers.add(reader);
      } catch (IOException ioe) {
        System.err.println("Error reading: " + arg);
      }
    }

    for (IndexReader reader : readers) {
      TermDocs termDocs = reader.termDocs();
      TermEnum termEnum = reader.terms(new Term(field));

      try {
        do {
          Term term = termEnum.term();

          if (term == null || !field.equals(term.field())) break;

          if (value == null) {
            if (count) {
              termDocs.seek(termEnum);

              int c = 0;
              for (; termDocs.next(); c++) ;

              System.out.print(c + " ");
            }
            System.out.println(term.text());
          } else if (value.equals(term.text())) {
            termDocs.seek(termEnum);

            while (termDocs.next()) {
              if (all) {
                Document d = reader.document(termDocs.doc());
                System.out.println(termDocs.doc());
                for (Object o : d.getFields()) {
                  Field f = (Field) o;
                  System.out.println(f.name() + " " + d.get(f.name()));
                }
              } else {
                System.out.println(
                    termDocs.doc() + " " + reader.document(termDocs.doc()).get("url"));
              }
            }
          }
        } while (termEnum.next());
      } finally {
        termDocs.close();
        termEnum.close();
      }
    }
  }