public FileItem(ResultDocument doc) throws UnsupportedEncodingException, URISyntaxException { super(doc); setRepository(doc.getValueContent(FileItemFieldEnum.INSTANCE.repository.getName(), 0)); setDirectory(doc.getValueContent(FileItemFieldEnum.INSTANCE.directory.getName(), 0)); setHost(doc.getValueContent(FileItemFieldEnum.INSTANCE.host.getName(), 0)); setSubDirectory( FieldValueItem.buildArrayList( doc.getValues(FileItemFieldEnum.INSTANCE.subDirectory.getName()))); setLang(doc.getValueContent(FileItemFieldEnum.INSTANCE.lang.getName(), 0)); setLangMethod(doc.getValueContent(FileItemFieldEnum.INSTANCE.langMethod.getName(), 0)); setCrawlDate(doc.getValueContent(FileItemFieldEnum.INSTANCE.crawlDate.getName(), 0)); setFileExtension(doc.getValueContent(FileItemFieldEnum.INSTANCE.fileExtension.getName(), 0)); setParser(doc.getValueContent(FileItemFieldEnum.INSTANCE.parser.getName(), 0)); setTime(doc.getValueContent(FileItemFieldEnum.INSTANCE.time.getName(), 0)); setUserAllow( FieldValueItem.buildArrayList( doc.getValues(FileItemFieldEnum.INSTANCE.userAllow.getName()))); setUserDeny( FieldValueItem.buildArrayList( doc.getValues(FileItemFieldEnum.INSTANCE.userDeny.getName()))); setGroupAllow( FieldValueItem.buildArrayList( doc.getValues(FileItemFieldEnum.INSTANCE.groupAllow.getName()))); setGroupDeny( FieldValueItem.buildArrayList( doc.getValues(FileItemFieldEnum.INSTANCE.groupDeny.getName()))); }
public boolean getSnippets( final int docId, final ReaderInterface reader, final List<FieldValueItem> values, final List<FieldValueItem> snippets, final Timer parentTimer) throws IOException, ParseException, SyntaxError, SearchLibException { if (values == null) return false; final Timer timer = new Timer(parentTimer, "SnippetField " + this.name); final long halfTimeExpiration = this.timeLimit == 0 ? 0 : timer.getStartOffset(this.timeLimit / 2); final long expiration = this.timeLimit == 0 ? 0 : timer.getStartOffset(this.timeLimit); FragmenterAbstract fragmenter = fragmenterTemplate.newInstance(); SnippetVector currentVector = null; Timer t = new Timer(timer, "extractTermVectorIterator"); Iterator<SnippetVector> vectorIterator = SnippetVectors.extractTermVectorIterator( docId, reader, snippetQueries, name, values, indexAnalyzer, t, halfTimeExpiration); if (vectorIterator != null) currentVector = vectorIterator.hasNext() ? vectorIterator.next() : null; t.end(null); t = new Timer(timer, "getFraments"); int startOffset = 0; FragmentList fragments = new FragmentList(); int vectorOffset = 0; for (FieldValueItem valueItem : values) { String value = valueItem.getValue(); if (value != null) { // VectorOffset++ depends of EndOffset bug #patch Lucene 579 and // 1458 fragmenter.getFragments(value, fragments, vectorOffset++); } } t.end(null); if (fragments.size() == 0) { timer.end(null); return false; } t = new Timer(timer, "checkValue"); Fragment fragment = fragments.first(); while (fragment != null) { currentVector = checkValue(currentVector, vectorIterator, startOffset, fragment); startOffset += fragment.getOriginalText().length(); fragment = fragment.next(); } t.end(null); Timer sbTimer = new Timer(timer, "snippetBuilder"); boolean result = false; int snippetCounter = maxSnippetNumber; int scoredFragment = 0; while (snippetCounter-- != 0) { Fragment bestScoreFragment = null; fragment = Fragment.findNextHighlightedFragment(fragments.first()); List<Fragment> scoreFragments = 
new ArrayList<Fragment>(0); double maxSearchScore = 0; t = new Timer(sbTimer, "fragmentScore"); boolean expired = false; while (fragment != null) { double sc = fragment.searchScore(name, queryAnalyzer, query); if (sc > maxSearchScore) maxSearchScore = sc; scoreFragments.add(fragment); fragment = Fragment.findNextHighlightedFragment(fragment.next()); scoredFragment++; if (expiration != 0) { if (System.currentTimeMillis() > expiration) { expired = true; break; } } } t.end("fragmentScore " + scoredFragment + " " + expired); for (Fragment frag : scoreFragments) bestScoreFragment = Fragment.bestScore(bestScoreFragment, frag, maxSearchScore, maxSnippetSize); if (bestScoreFragment != null) { SnippetBuilder snippetBuilder = new SnippetBuilder(maxSnippetSize, unescapedSeparator, tags, bestScoreFragment); if (snippetBuilder.length() > 0) snippets.add(new FieldValueItem(FieldValueOriginEnum.SNIPPET, snippetBuilder.toString())); fragments.remove(snippetBuilder.getFragments()); result = true; continue; } if (fragments.first() == null) break; SnippetBuilder snippetBuilder = new SnippetBuilder(maxSnippetSize, unescapedSeparator, tags, fragments.first()); if (snippetBuilder.length() > 0) { snippets.add(new FieldValueItem(FieldValueOriginEnum.SNIPPET, snippetBuilder.toString())); fragments.remove(snippetBuilder.getFragments()); } } sbTimer.end(null); timer.end(null); return result; }
/**
 * Prints every value of the given return field of a document, CSV-escaped.
 *
 * <p>Values are written one after another through {@code writer.print}; this method adds
 * no separator between them. Nothing is printed when the document has no values for the
 * field.
 *
 * @param doc the result document to read the values from
 * @param field the return field whose values are printed
 */
private void renderField(ResultDocument doc, ReturnField field) {
    final List<FieldValueItem> fieldValues = doc.getValues(field);
    if (fieldValues != null) {
        for (FieldValueItem item : fieldValues) {
            final String escaped = StringEscapeUtils.escapeCsv(item.getValue());
            writer.print(escaped);
        }
    }
}
/**
 * Parses the content of the current URL item and updates the item from the parser output.
 *
 * <p>The method:
 * <ol>
 *   <li>sets the parser status to {@code NOPARSER} and returns when there is no parser
 *       selector or no parser was returned, and to {@code PARSER_ERROR} when the parser
 *       reports an error;</li>
 *   <li>replaces the item's inbound/outbound links and copies language, language method
 *       and charset from each parser result;</li>
 *   <li>sets the parser status to {@code PARSED}, or {@code PARSED_NON_CANONICAL} for an
 *       {@code HtmlParser} that is not canonical;</li>
 *   <li>stores the new md5/size signature and sets the content update date to the parsing
 *       start time when no date was known yet, or when both the old and new signatures
 *       are present and differ;</li>
 *   <li>sets the index status to {@code META_NOINDEX} when a {@code meta_robots} value
 *       equals "noindex" (case-insensitive).</li>
 * </ol>
 *
 * @param inputStream the stream handed to the parser selector
 * @throws InstantiationException declared; raised by a callee
 * @throws IllegalAccessException declared; raised by a callee
 * @throws ClassNotFoundException declared; raised by a callee
 * @throws IOException declared; raised by a callee
 * @throws SearchLibException declared; raised by a callee
 * @throws NoSuchAlgorithmException declared; raised by a callee
 * @throws URISyntaxException declared; raised by a callee
 */
protected void parseContent(InputStream inputStream)
        throws InstantiationException,
                IllegalAccessException,
                ClassNotFoundException,
                IOException,
                SearchLibException,
                NoSuchAlgorithmException,
                URISyntaxException {
    if (parserSelector == null) {
        urlItem.setParserStatus(ParserStatus.NOPARSER);
        return;
    }

    // Prefer the Content-Disposition file name; otherwise derive one from the URL path.
    String fileName = urlItem.getContentDispositionFilename();
    if (fileName == null) {
        URL url = urlItem.getURL();
        if (url != null) {
            fileName = FilenameUtils.getName(url.getFile());
        }
    }

    IndexDocument sourceDocument = new IndexDocument();
    urlItem.populate(sourceDocument);
    Date parserStartDate = new Date();
    // TODO Which language for OCR ?
    parser =
            parserSelector.parseStream(
                    sourceDocument,
                    fileName,
                    urlItem.getContentBaseType(),
                    urlItem.getUrl(),
                    inputStream,
                    null,
                    parserSelector.getWebCrawlerDefaultParser(),
                    parserSelector.getFileCrawlerDefaultParser());
    if (parser == null) {
        urlItem.setParserStatus(ParserStatus.NOPARSER);
        return;
    }
    if (parser.getError() != null) {
        urlItem.setParserStatus(ParserStatus.PARSER_ERROR);
        return;
    }

    // Rebuild the link lists and copy language/charset information from the results.
    // Each result overwrites lang, langMethod and charset, so the last result wins.
    urlItem.clearInLinks();
    urlItem.clearOutLinks();
    for (ParserResultItem result : parser.getParserResults()) {
        urlItem.addInLinks(result.getFieldContent(ParserFieldEnum.internal_link));
        urlItem.addInLinks(result.getFieldContent(ParserFieldEnum.internal_link_nofollow));
        urlItem.addOutLinks(result.getFieldContent(ParserFieldEnum.external_link));
        urlItem.addOutLinks(result.getFieldContent(ParserFieldEnum.external_link_nofollow));
        urlItem.setLang(result.getFieldValue(ParserFieldEnum.lang, 0));
        urlItem.setLangMethod(result.getFieldValue(ParserFieldEnum.lang_method, 0));
        urlItem.setContentTypeCharset(result.getFieldValue(ParserFieldEnum.charset, 0));
    }

    ParserStatus parsedStatus = ParserStatus.PARSED;
    if (parser instanceof HtmlParser) {
        if (!((HtmlParser) parser).isCanonical()) {
            parsedStatus = ParserStatus.PARSED_NON_CANONICAL;
        }
    }
    urlItem.setParserStatus(parsedStatus);

    // Read the previous signature before overwriting it with the new one.
    String oldMd5size = urlItem.getMd5size();
    String newMd5size = parser.getMd5size();
    urlItem.setMd5size(newMd5size);

    // The update date changes on first parse, or when both signatures exist and differ.
    Date oldContentUpdateDate = urlItem.getContentUpdateDate();
    Date newContentUpdateDate = null;
    if (oldContentUpdateDate == null) {
        newContentUpdateDate = parserStartDate;
    } else if (oldMd5size != null && newMd5size != null) {
        if (!oldMd5size.equals(newMd5size)) {
            newContentUpdateDate = parserStartDate;
        }
    }
    if (newContentUpdateDate != null) {
        urlItem.setContentUpdateDate(newContentUpdateDate);
    }

    // Honor a "noindex" robots meta directive.
    for (ParserResultItem result : parser.getParserResults()) {
        FieldContent fieldContent = result.getFieldContent(ParserFieldEnum.meta_robots);
        if (fieldContent == null) {
            continue;
        }
        List<FieldValueItem> fieldValues = fieldContent.getValues();
        if (fieldValues == null) {
            continue;
        }
        // Iterate the list that was just null-checked instead of looking it up again.
        for (FieldValueItem item : fieldValues) {
            if ("noindex".equalsIgnoreCase(item.getValue())) {
                urlItem.setIndexStatus(IndexStatus.META_NOINDEX);
                break;
            }
        }
    }
}
/**
 * Prints every snippet value of the given snippet field of a document, CSV-escaped.
 *
 * <p>Snippets are written one after another through {@code writer.print}; this method adds
 * no separator between them. Nothing is printed when the document has no snippet values
 * for the field.
 *
 * @param doc the result document to read the snippet values from
 * @param field the snippet field whose values are printed
 */
private void renderSnippetValue(ResultDocument doc, SnippetField field) {
    final List<FieldValueItem> snippetValues = doc.getSnippetValues(field);
    if (snippetValues != null) {
        for (FieldValueItem item : snippetValues) {
            final String escaped = StringEscapeUtils.escapeCsv(item.getValue());
            writer.print(escaped);
        }
    }
}