/** * Populate a Lucene document with the required fields. * * @param doc The document to populate * @param file The file to index * @param path Where the file is located (from source root) * @param fa The analyzer to use on the file * @param xrefOut Where to write the xref (possibly {@code null}) * @throws IOException If an exception occurs while collecting the data */ public void populateDocument( Document doc, File file, String path, FileAnalyzer fa, Writer xrefOut) throws IOException { String date = DateTools.timeToString(file.lastModified(), DateTools.Resolution.MILLISECOND); doc.add(new Field(QueryBuilder.U, Util.path2uid(path, date), string_ft_stored_nanalyzed_norms)); doc.add( new Field( QueryBuilder.FULLPATH, file.getAbsolutePath(), string_ft_nstored_nanalyzed_norms)); doc.add(new SortedDocValuesField(QueryBuilder.FULLPATH, new BytesRef(file.getAbsolutePath()))); try { HistoryReader hr = HistoryGuru.getInstance().getHistoryReader(file); if (hr != null) { doc.add(new TextField(QueryBuilder.HIST, hr)); // date = hr.getLastCommentDate() //RFE } } catch (HistoryException e) { LOGGER.log(Level.WARNING, "An error occurred while reading history: ", e); } doc.add(new Field(QueryBuilder.DATE, date, string_ft_stored_nanalyzed_norms)); doc.add(new SortedDocValuesField(QueryBuilder.DATE, new BytesRef(date))); if (path != null) { doc.add(new TextField(QueryBuilder.PATH, path, Store.YES)); Project project = Project.getProject(path); if (project != null) { doc.add(new TextField(QueryBuilder.PROJECT, project.getPath(), Store.YES)); } } if (fa != null) { Genre g = fa.getGenre(); if (g == Genre.PLAIN || g == Genre.XREFABLE || g == Genre.HTML) { doc.add(new Field(QueryBuilder.T, g.typeName(), string_ft_stored_nanalyzed_norms)); } fa.analyze(doc, StreamSource.fromFile(file), xrefOut); String type = fa.getFileTypeName(); doc.add(new StringField(QueryBuilder.TYPE, type, Store.YES)); } }
/**
 * Creates Lucene document with the fields:
 *
 * <ul>
 *   <li>path: relative path from the constructor
 *   <li>id: the same as path
 *   <li>modified: last modified date of the file
 *   <li>filesize: size of the file
 *   <li>title: name of the file
 * </ul>
 *
 * @return New Lucene document.
 */
@Override
public Document createDocument() {
  Document document = new Document();
  String pathValue = path.toString();
  // "id" mirrors "path" so the document can be addressed either way
  document.add(new StringField("path", pathValue, Field.Store.YES));
  document.add(new StringField("id", pathValue, Field.Store.YES));
  try {
    String modified =
        DateTools.timeToString(
            Files.getLastModifiedTime(file).toMillis(), DateTools.Resolution.MINUTE);
    document.add(new StringField("modified", modified, Field.Store.YES));
    document.add(new LongField("filesize", Files.size(file), Field.Store.YES));
  } catch (IOException ex) {
    // file metadata unavailable: index the document without modified/filesize
    LOG.error(ex);
  }
  document.add(new TextField("title", file.getFileName().toString(), Field.Store.YES));
  return document;
}
/**
 * Add a row to the index.
 *
 * <p>Note: the leading commented-out section is a preserved Lucene 2 variant of this method
 * (kept as a compile-time toggle); the active code below it targets Lucene 3.
 *
 * @param row the row
 * @param commitIndex whether to commit the changes to the Lucene index
 * @throws SQLException if adding the document to the index fails
 */
protected void insert(Object[] row, boolean commitIndex) throws SQLException {
    /*## LUCENE2 ##
    String query = getQuery(row);
    Document doc = new Document();
    doc.add(new Field(LUCENE_FIELD_QUERY, query, Field.Store.YES, Field.Index.UN_TOKENIZED));
    long time = System.currentTimeMillis();
    doc.add(new Field(LUCENE_FIELD_MODIFIED,
        DateTools.timeToString(time, DateTools.Resolution.SECOND),
        Field.Store.YES, Field.Index.UN_TOKENIZED));
    StatementBuilder buff = new StatementBuilder();
    for (int index : indexColumns) {
        String columnName = columns[index];
        String data = asString(row[index], columnTypes[index]);
        // column names that start with _ must be escaped to avoid conflicts
        // with internal field names (_DATA, _QUERY, _modified)
        if (columnName.startsWith(LUCENE_FIELD_COLUMN_PREFIX)) {
            columnName = LUCENE_FIELD_COLUMN_PREFIX + columnName;
        }
        doc.add(new Field(columnName, data, Field.Store.NO, Field.Index.TOKENIZED));
        buff.appendExceptFirst(" ");
        buff.append(data);
    }
    Field.Store storeText = STORE_DOCUMENT_TEXT_IN_INDEX ?
        Field.Store.YES : Field.Store.NO;
    doc.add(new Field(LUCENE_FIELD_DATA, buff.toString(), storeText,
        Field.Index.TOKENIZED));
    try {
        indexAccess.modifier.addDocument(doc);
    } catch (IOException e) {
        throw convertException(e);
    }
    //*/
    // ## LUCENE3 ##
    // Store the originating query so the matching row can be re-fetched from search hits.
    String query = getQuery(row);
    Document doc = new Document();
    doc.add(new Field(LUCENE_FIELD_QUERY, query, Field.Store.YES, Field.Index.NOT_ANALYZED));
    long time = System.currentTimeMillis();
    doc.add(
        new Field(
            LUCENE_FIELD_MODIFIED,
            DateTools.timeToString(time, DateTools.Resolution.SECOND),
            Field.Store.YES,
            Field.Index.NOT_ANALYZED));
    StatementBuilder buff = new StatementBuilder();
    for (int index : indexColumns) {
      String columnName = columns[index];
      String data = asString(row[index], columnTypes[index]);
      // column names that start with _
      // must be escaped to avoid conflicts
      // with internal field names (_DATA, _QUERY, _modified)
      if (columnName.startsWith(LUCENE_FIELD_COLUMN_PREFIX)) {
        columnName = LUCENE_FIELD_COLUMN_PREFIX + columnName;
      }
      doc.add(new Field(columnName, data, Field.Store.NO, Field.Index.ANALYZED));
      buff.appendExceptFirst(" ");
      buff.append(data);
    }
    // The space-joined column values form the searchable full-text body of the document.
    Field.Store storeText = STORE_DOCUMENT_TEXT_IN_INDEX ? Field.Store.YES : Field.Store.NO;
    doc.add(new Field(LUCENE_FIELD_DATA, buff.toString(), storeText, Field.Index.ANALYZED));
    try {
      indexAccess.writer.addDocument(doc);
      if (commitIndex) {
        commitIndex();
      }
    } catch (IOException e) {
      // surface index I/O failures to the caller as SQLException
      throw convertException(e);
    }
    // */
}
/**
 * Generate indexes recursively.
 *
 * @param dir the directory to generate indexes for
 * @param parent the path of {@code dir} relative to the source root
 * @param count_only if true will just traverse the source root and count files
 * @param cur_count current count during the traversal of the tree
 * @param est_total estimated total files to process (used only for progress logging)
 * @return the updated file count after processing this directory
 * @throws IOException if an I/O error occurs while indexing
 */
private int indexDown(File dir, String parent, boolean count_only, int cur_count, int est_total)
    throws IOException {
  int lcur_count = cur_count;
  if (isInterrupted()) {
    return lcur_count;
  }

  if (!accept(dir)) {
    return lcur_count;
  }

  File[] files = dir.listFiles();
  if (files == null) {
    log.log(Level.SEVERE, "Failed to get file listing for: {0}", dir.getAbsolutePath());
    return lcur_count;
  }
  // Deterministic (name-sorted) traversal so the walk below stays aligned with the
  // sorted uid terms produced by uidIter.
  Arrays.sort(
      files,
      new Comparator<File>() {
        @Override
        public int compare(File p1, File p2) {
          return p1.getName().compareTo(p2.getName());
        }
      });

  for (File file : files) {
    if (accept(dir, file)) {
      String path = parent + '/' + file.getName();

      if (file.isDirectory()) {
        lcur_count = indexDown(file, path, count_only, lcur_count, est_total);
      } else {
        lcur_count++;
        if (count_only) {
          continue;
        }

        if (RuntimeEnvironment.getInstance().isPrintProgress()
            && est_total > 0
            && log.isLoggable(Level.INFO)) {
          log.log(
              Level.INFO,
              "Progress: {0} ({1}%)",
              new Object[] {lcur_count, (lcur_count * 100.0f / est_total)});
        }

        if (uidIter != null) {
          // The uid is built from the path plus the file's last-modified timestamp.
          String uid =
              Util.path2uid(
                  path,
                  DateTools.timeToString(
                      file.lastModified(), DateTools.Resolution.MILLISECOND)); // construct uid for doc
          BytesRef buid = new BytesRef(uid);
          // Remove index entries whose uid sorts strictly before the current file's uid:
          // those documents no longer have a matching file (or timestamp) on disk.
          while (uidIter.term() != null
              && uidIter.term().compareTo(emptyBR) != 0
              && uidIter.term().compareTo(buid) < 0) {
            removeFile();
            uidIter.next();
          }

          // An exact uid match means path AND timestamp are unchanged, so the
          // indexed copy is current and the file can be skipped.
          if (uidIter.term() != null && uidIter.term().bytesEquals(buid)) {
            uidIter.next(); // keep matching docs
            continue;
          }
        }
        try {
          addFile(file, path);
        } catch (Exception e) {
          // best-effort: one bad file must not abort the whole traversal
          log.log(Level.WARNING, "Failed to add file " + file.getAbsolutePath(), e);
        }
      }
    }
  }

  return lcur_count;
}
/** * Index the fileset. * * @exception IOException if Lucene I/O exception TODO: refactor!!!!! */ private void indexDocs() throws IOException { Date start = new Date(); boolean create = overwrite; // If the index directory doesn't exist, // create it and force create mode if (indexDir.mkdirs() && !overwrite) { create = true; } FSDirectory dir = FSDirectory.open(indexDir); try { Searcher searcher = null; boolean checkLastModified = false; if (!create) { try { searcher = new IndexSearcher(dir, true); checkLastModified = true; } catch (IOException ioe) { log("IOException: " + ioe.getMessage()); // Empty - ignore, which indicates to index all // documents } } log("checkLastModified = " + checkLastModified, Project.MSG_VERBOSE); IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_CURRENT, analyzer) .setOpenMode(create ? OpenMode.CREATE : OpenMode.APPEND); LogMergePolicy lmp = (LogMergePolicy) conf.getMergePolicy(); lmp.setUseCompoundFile(useCompoundIndex); lmp.setMergeFactor(mergeFactor); IndexWriter writer = new IndexWriter(dir, conf); int totalFiles = 0; int totalIndexed = 0; int totalIgnored = 0; try { for (int i = 0; i < rcs.size(); i++) { ResourceCollection rc = rcs.elementAt(i); if (rc.isFilesystemOnly()) { Iterator resources = rc.iterator(); while (resources.hasNext()) { Resource r = (Resource) resources.next(); if (!r.isExists() || !(r instanceof FileResource)) { continue; } totalFiles++; File file = ((FileResource) r).getFile(); if (!file.exists() || !file.canRead()) { throw new BuildException( "File \"" + file.getAbsolutePath() + "\" does not exist or is not readable."); } boolean indexIt = true; if (checkLastModified) { Term pathTerm = new Term("path", file.getPath()); TermQuery query = new TermQuery(pathTerm); ScoreDoc[] hits = searcher.search(query, null, 1).scoreDocs; // if document is found, compare the // indexed last modified time with the // current file // - don't index if up to date if (hits.length > 0) { Document doc = 
searcher.doc(hits[0].doc); String indexModified = doc.get("modified").trim(); if (indexModified != null) { long lastModified = 0; try { lastModified = DateTools.stringToTime(indexModified); } catch (ParseException e) { // if modified time is not parsable, skip } if (lastModified == file.lastModified()) { // TODO: remove existing document indexIt = false; } } } } if (indexIt) { try { log("Indexing " + file.getPath(), Project.MSG_VERBOSE); Document doc = handler.getDocument(file); if (doc == null) { totalIgnored++; } else { // Add the path of the file as a field named "path". Use a Keyword field, so // that the index stores the path, and so that the path is searchable doc.add( new Field( "path", file.getPath(), Field.Store.YES, Field.Index.NOT_ANALYZED)); // Add the last modified date of the file a field named "modified". Use a // Keyword field, so that it's searchable, but so that no attempt is made // to tokenize the field into words. doc.add( new Field( "modified", DateTools.timeToString( file.lastModified(), DateTools.Resolution.MILLISECOND), Field.Store.YES, Field.Index.NOT_ANALYZED)); writer.addDocument(doc); totalIndexed++; } } catch (DocumentHandlerException e) { throw new BuildException(e); } } } // for j } // if (fs != null) } // for i writer.optimize(); } // try finally { // always make sure everything gets closed, // no matter how we exit. writer.close(); if (searcher != null) { searcher.close(); } } Date end = new Date(); log( totalIndexed + " out of " + totalFiles + " indexed (" + totalIgnored + " ignored) in " + (end.getTime() - start.getTime()) + " milliseconds"); } finally { dir.close(); } }
/** Converts {@code time} (epoch milliseconds) to an index-ready string at the configured resolution. */
private static String timeToString(long time) {
  String formatted = DateTools.timeToString(time, DATE_TIME_RES);
  return formatted;
}
private Query createQuery(SearchEngineFilter filter) { BooleanQuery fieldQuery = new BooleanQuery(); String key = filter.getKey(); String attachmentKey = key + IIndexerDAO.ATTACHMENT_FIELD_SUFFIX; Object value = filter.getValue(); if (null != value) { if (value instanceof String) { SearchEngineFilter.TextSearchOption option = filter.getTextSearchOption(); if (null == option) { option = SearchEngineFilter.TextSearchOption.AT_LEAST_ONE_WORD; } String stringValue = value.toString(); String[] values = stringValue.split("\\s+"); if (!option.equals(SearchEngineFilter.TextSearchOption.EXACT)) { BooleanClause.Occur bc = BooleanClause.Occur.SHOULD; if (option.equals(SearchEngineFilter.TextSearchOption.ALL_WORDS)) { bc = BooleanClause.Occur.MUST; } else if (option.equals(SearchEngineFilter.TextSearchOption.ANY_WORD)) { bc = BooleanClause.Occur.MUST_NOT; } for (int i = 0; i < values.length; i++) { TermQuery term = new TermQuery(new Term(key, values[i].toLowerCase())); // NOTE: search lower case.... if (filter.isIncludeAttachments()) { BooleanQuery compositeQuery = new BooleanQuery(); compositeQuery.add(term, BooleanClause.Occur.SHOULD); TermQuery termAttachment = new TermQuery(new Term(attachmentKey, values[i].toLowerCase())); compositeQuery.add(termAttachment, BooleanClause.Occur.SHOULD); fieldQuery.add(compositeQuery, bc); } else { fieldQuery.add(term, bc); } } } else { PhraseQuery phraseQuery = new PhraseQuery(); for (int i = 0; i < values.length; i++) { // NOTE: search lower case.... phraseQuery.add(new Term(key, values[i].toLowerCase())); } if (filter.isIncludeAttachments()) { fieldQuery.add(phraseQuery, BooleanClause.Occur.SHOULD); PhraseQuery phraseQuery2 = new PhraseQuery(); for (int i = 0; i < values.length; i++) { // NOTE: search lower case.... 
phraseQuery2.add(new Term(attachmentKey, values[i].toLowerCase())); } fieldQuery.add(phraseQuery2, BooleanClause.Occur.SHOULD); } else { return phraseQuery; } } } else if (value instanceof Date) { String toString = DateTools.timeToString(((Date) value).getTime(), DateTools.Resolution.MINUTE); TermQuery term = new TermQuery(new Term(filter.getKey(), toString)); fieldQuery.add(term, BooleanClause.Occur.MUST); } else if (value instanceof Number) { TermQuery term = new TermQuery(new Term(filter.getKey(), value.toString())); fieldQuery.add(term, BooleanClause.Occur.MUST); } } else { if (filter.getStart() instanceof Number || filter.getEnd() instanceof Number) { // .............................. TODO } else { String start = null; String end = null; if (filter.getStart() instanceof Date || filter.getEnd() instanceof Date) { if (null != filter.getStart()) { start = DateTools.timeToString( ((Date) filter.getStart()).getTime(), DateTools.Resolution.MINUTE); } if (null != filter.getEnd()) { end = DateTools.timeToString( ((Date) filter.getEnd()).getTime(), DateTools.Resolution.MINUTE); } } else { start = (null != filter.getStart()) ? filter.getStart().toString().toLowerCase() : null; end = (null != filter.getEnd()) ? filter.getEnd().toString().toLowerCase() : null; } BytesRef byteStart = (null != start) ? new BytesRef(start.getBytes()) : null; BytesRef byteEnd = (null != end) ? new BytesRef(end.getBytes()) : null; TermRangeQuery range = new TermRangeQuery(filter.getKey(), byteStart, byteEnd, true, true); fieldQuery.add(range, BooleanClause.Occur.MUST); } } return fieldQuery; }