Пример #1
0
  public FileItem(ResultDocument doc) throws UnsupportedEncodingException, URISyntaxException {
    super(doc);

    setRepository(doc.getValueContent(FileItemFieldEnum.INSTANCE.repository.getName(), 0));
    setDirectory(doc.getValueContent(FileItemFieldEnum.INSTANCE.directory.getName(), 0));
    setHost(doc.getValueContent(FileItemFieldEnum.INSTANCE.host.getName(), 0));
    setSubDirectory(
        FieldValueItem.buildArrayList(
            doc.getValues(FileItemFieldEnum.INSTANCE.subDirectory.getName())));

    setLang(doc.getValueContent(FileItemFieldEnum.INSTANCE.lang.getName(), 0));

    setLangMethod(doc.getValueContent(FileItemFieldEnum.INSTANCE.langMethod.getName(), 0));

    setCrawlDate(doc.getValueContent(FileItemFieldEnum.INSTANCE.crawlDate.getName(), 0));

    setFileExtension(doc.getValueContent(FileItemFieldEnum.INSTANCE.fileExtension.getName(), 0));

    setParser(doc.getValueContent(FileItemFieldEnum.INSTANCE.parser.getName(), 0));

    setTime(doc.getValueContent(FileItemFieldEnum.INSTANCE.time.getName(), 0));

    setUserAllow(
        FieldValueItem.buildArrayList(
            doc.getValues(FileItemFieldEnum.INSTANCE.userAllow.getName())));
    setUserDeny(
        FieldValueItem.buildArrayList(
            doc.getValues(FileItemFieldEnum.INSTANCE.userDeny.getName())));
    setGroupAllow(
        FieldValueItem.buildArrayList(
            doc.getValues(FileItemFieldEnum.INSTANCE.groupAllow.getName())));
    setGroupDeny(
        FieldValueItem.buildArrayList(
            doc.getValues(FileItemFieldEnum.INSTANCE.groupDeny.getName())));
  }
Пример #2
0
  /**
   * Builds the snippets of this field for one document and appends them to {@code snippets}.
   *
   * <p>The work is done in four timed phases: extraction of the term vectors, fragmentation of
   * the field values, matching of the vectors against the fragments ({@code checkValue}), and
   * finally the selection of the best fragments, each of which is turned into a snippet.
   *
   * <p>When no fragment can be selected by score, a snippet is still built from the first
   * remaining fragment, but that fallback does not change the returned value.
   *
   * <p>The declared checked exceptions are not thrown directly by this method; they are
   * propagated from the calls it makes.
   *
   * @param docId the identifier of the document, passed to the term vector extraction
   * @param reader the reader passed to the term vector extraction
   * @param values the values of the field; when {@code null} the method returns {@code false}
   *     immediately
   * @param snippets output list: every non-empty snippet built is added to it
   * @param parentTimer the timer under which this method's own timer is created
   * @return {@code true} if at least one snippet was built from a best-score fragment, {@code
   *     false} otherwise (including when {@code values} is {@code null} or yields no fragment)
   */
  public boolean getSnippets(
      final int docId,
      final ReaderInterface reader,
      final List<FieldValueItem> values,
      final List<FieldValueItem> snippets,
      final Timer parentTimer)
      throws IOException, ParseException, SyntaxError, SearchLibException {

    if (values == null) return false;

    final Timer timer = new Timer(parentTimer, "SnippetField " + this.name);
    // A timeLimit of 0 leaves both expirations at 0; for `expiration`, 0 disables the check in
    // the scoring loop below.
    // NOTE(review): getStartOffset presumably returns the timer start time plus the given
    // offset in milliseconds (the result is compared to System.currentTimeMillis()) - confirm.
    final long halfTimeExpiration =
        this.timeLimit == 0 ? 0 : timer.getStartOffset(this.timeLimit / 2);
    final long expiration = this.timeLimit == 0 ? 0 : timer.getStartOffset(this.timeLimit);

    FragmenterAbstract fragmenter = fragmenterTemplate.newInstance();
    SnippetVector currentVector = null;

    // Phase 1: term vector extraction, given half of the time budget.
    Timer t = new Timer(timer, "extractTermVectorIterator");

    Iterator<SnippetVector> vectorIterator =
        SnippetVectors.extractTermVectorIterator(
            docId, reader, snippetQueries, name, values, indexAnalyzer, t, halfTimeExpiration);
    // The iterator may be null; in that case currentVector stays null.
    if (vectorIterator != null)
      currentVector = vectorIterator.hasNext() ? vectorIterator.next() : null;

    t.end(null);

    // Phase 2: split every non-null value into fragments.
    t = new Timer(timer, "getFraments");

    int startOffset = 0;
    FragmentList fragments = new FragmentList();
    int vectorOffset = 0;
    for (FieldValueItem valueItem : values) {
      String value = valueItem.getValue();
      if (value != null) {
        // vectorOffset is incremented once per non-null value. This increment depends on the
        // EndOffset bug (Lucene patches 579 and 1458).
        fragmenter.getFragments(value, fragments, vectorOffset++);
      }
    }

    t.end(null);

    // Nothing to build snippets from: close the main timer and give up.
    if (fragments.size() == 0) {
      timer.end(null);
      return false;
    }

    // Phase 3: walk the fragment chain, handing each fragment to checkValue together with the
    // current vector. startOffset accumulates the length of the original text of the fragments
    // already visited.
    t = new Timer(timer, "checkValue");

    Fragment fragment = fragments.first();
    while (fragment != null) {
      currentVector = checkValue(currentVector, vectorIterator, startOffset, fragment);
      startOffset += fragment.getOriginalText().length();
      fragment = fragment.next();
    }

    t.end(null);

    // Phase 4: build the snippets. The counter starts at maxSnippetNumber and the loop stops
    // when it reaches zero or when no fragment is left.
    Timer sbTimer = new Timer(timer, "snippetBuilder");

    boolean result = false;
    int snippetCounter = maxSnippetNumber;
    // Cumulative over all iterations of the loop below; only used in the timer message.
    int scoredFragment = 0;
    while (snippetCounter-- != 0) {
      Fragment bestScoreFragment = null;
      fragment = Fragment.findNextHighlightedFragment(fragments.first());
      List<Fragment> scoreFragments = new ArrayList<Fragment>(0);
      double maxSearchScore = 0;

      t = new Timer(sbTimer, "fragmentScore");
      boolean expired = false;

      // Score the candidate fragments, remembering the highest score seen. The loop is
      // abandoned once the time limit is exceeded; fragments scored so far are kept.
      while (fragment != null) {
        double sc = fragment.searchScore(name, queryAnalyzer, query);
        if (sc > maxSearchScore) maxSearchScore = sc;
        scoreFragments.add(fragment);
        fragment = Fragment.findNextHighlightedFragment(fragment.next());
        scoredFragment++;
        if (expiration != 0) {
          if (System.currentTimeMillis() > expiration) {
            expired = true;
            break;
          }
        }
      }

      t.end("fragmentScore " + scoredFragment + " " + expired);

      // Fold the scored fragments into a single best candidate.
      for (Fragment frag : scoreFragments)
        bestScoreFragment =
            Fragment.bestScore(bestScoreFragment, frag, maxSearchScore, maxSnippetSize);

      if (bestScoreFragment != null) {
        SnippetBuilder snippetBuilder =
            new SnippetBuilder(maxSnippetSize, unescapedSeparator, tags, bestScoreFragment);
        if (snippetBuilder.length() > 0)
          snippets.add(new FieldValueItem(FieldValueOriginEnum.SNIPPET, snippetBuilder.toString()));
        // The fragments used by the builder are removed even when the snippet is empty, so
        // they are not selected again.
        fragments.remove(snippetBuilder.getFragments());
        result = true;
        continue;
      }

      // Fallback: no best-score fragment was found, so build the snippet from the first
      // remaining fragment. `result` is deliberately left untouched on this path.
      if (fragments.first() == null) break;
      SnippetBuilder snippetBuilder =
          new SnippetBuilder(maxSnippetSize, unescapedSeparator, tags, fragments.first());
      if (snippetBuilder.length() > 0) {
        snippets.add(new FieldValueItem(FieldValueOriginEnum.SNIPPET, snippetBuilder.toString()));
        fragments.remove(snippetBuilder.getFragments());
      }
    }

    sbTimer.end(null);

    timer.end(null);

    return result;
  }
Пример #3
0
 /**
  * Prints all the values of a returned field to {@code writer}.
  *
  * <p>Each value is CSV-escaped on its own and printed as-is; no separator is written between
  * consecutive values. Nothing is printed when the document has no value list for the field.
  *
  * @param doc the result document the values are read from
  * @param field the returned field whose values are printed
  */
 private void renderField(ResultDocument doc, ReturnField field) {
   final List<FieldValueItem> fieldValues = doc.getValues(field);
   if (fieldValues != null) {
     for (FieldValueItem item : fieldValues) {
       final String escaped = StringEscapeUtils.escapeCsv(item.getValue());
       writer.print(escaped);
     }
   }
 }
Пример #4
0
  /**
   * Parses the crawled content and records the outcome on {@code urlItem}.
   *
   * <p>Steps, in order:
   *
   * <ol>
   *   <li>Without a parser selector, or when no parser is returned for the content, the parser
   *       status is set to {@code NOPARSER}; when the parser reports an error, it is set to
   *       {@code PARSER_ERROR}. In all three cases nothing else is updated.
   *   <li>The in/out links are cleared and refilled from every parser result; language, language
   *       method and charset are set from each result in turn, so the last result wins.
   *   <li>The parser status becomes {@code PARSED}, or {@code PARSED_NON_CANONICAL} for an HTML
   *       parser that reports a non-canonical page.
   *   <li>The MD5/size signature is replaced, and the content update date is set to the time
   *       parsing started when there was no previous date or when both the old and the new
   *       signature are known and differ.
   *   <li>The index status is set to {@code META_NOINDEX} when any result carries a {@code
   *       noindex} meta-robots value (case-insensitive).
   * </ol>
   *
   * <p>The declared checked exceptions are not thrown directly here; they are propagated from the
   * calls made by this method.
   *
   * @param inputStream the content to parse, handed to the parser selector
   */
  protected void parseContent(InputStream inputStream)
      throws InstantiationException, IllegalAccessException, ClassNotFoundException, IOException,
          SearchLibException, NoSuchAlgorithmException, URISyntaxException {
    if (parserSelector == null) {
      urlItem.setParserStatus(ParserStatus.NOPARSER);
      return;
    }

    // Prefer the Content-Disposition file name; otherwise derive one from the URL.
    String fileName = urlItem.getContentDispositionFilename();
    if (fileName == null) {
      URL url = urlItem.getURL();
      if (url != null) {
        fileName = FilenameUtils.getName(url.getFile());
      }
    }

    IndexDocument sourceDocument = new IndexDocument();
    urlItem.populate(sourceDocument);
    Date parserStartDate = new Date();
    // TODO Which language for OCR ?
    parser =
        parserSelector.parseStream(
            sourceDocument,
            fileName,
            urlItem.getContentBaseType(),
            urlItem.getUrl(),
            inputStream,
            null,
            parserSelector.getWebCrawlerDefaultParser(),
            parserSelector.getFileCrawlerDefaultParser());
    if (parser == null) {
      urlItem.setParserStatus(ParserStatus.NOPARSER);
      return;
    }
    if (parser.getError() != null) {
      urlItem.setParserStatus(ParserStatus.PARSER_ERROR);
      return;
    }

    // Rebuild the link lists and the language/charset information from the parser results.
    urlItem.clearInLinks();
    urlItem.clearOutLinks();
    for (ParserResultItem result : parser.getParserResults()) {
      urlItem.addInLinks(result.getFieldContent(ParserFieldEnum.internal_link));
      urlItem.addInLinks(result.getFieldContent(ParserFieldEnum.internal_link_nofollow));
      urlItem.addOutLinks(result.getFieldContent(ParserFieldEnum.external_link));
      urlItem.addOutLinks(result.getFieldContent(ParserFieldEnum.external_link_nofollow));
      urlItem.setLang(result.getFieldValue(ParserFieldEnum.lang, 0));
      urlItem.setLangMethod(result.getFieldValue(ParserFieldEnum.lang_method, 0));
      urlItem.setContentTypeCharset(result.getFieldValue(ParserFieldEnum.charset, 0));
    }

    ParserStatus parsedStatus = ParserStatus.PARSED;
    if (parser instanceof HtmlParser && !((HtmlParser) parser).isCanonical()) {
      parsedStatus = ParserStatus.PARSED_NON_CANONICAL;
    }
    urlItem.setParserStatus(parsedStatus);

    // Read the previous signature and date before overwriting them.
    String oldMd5size = urlItem.getMd5size();
    String newMd5size = parser.getMd5size();
    urlItem.setMd5size(newMd5size);
    Date oldContentUpdateDate = urlItem.getContentUpdateDate();
    // A change is only recognised when both signatures are known.
    boolean signatureChanged =
        oldMd5size != null && newMd5size != null && !oldMd5size.equals(newMd5size);
    if (oldContentUpdateDate == null || signatureChanged) {
      urlItem.setContentUpdateDate(parserStartDate);
    }

    for (ParserResultItem result : parser.getParserResults()) {
      FieldContent robotsContent = result.getFieldContent(ParserFieldEnum.meta_robots);
      if (robotsContent == null) {
        continue;
      }
      List<FieldValueItem> robotsValues = robotsContent.getValues();
      if (robotsValues == null) {
        continue;
      }
      // Iterate over the list that was just null-checked rather than looking it up again.
      for (FieldValueItem item : robotsValues) {
        if ("noindex".equalsIgnoreCase(item.getValue())) {
          urlItem.setIndexStatus(IndexStatus.META_NOINDEX);
          break;
        }
      }
    }
  }
Пример #5
0
 /**
  * Prints all the snippet values of a snippet field to {@code writer}.
  *
  * <p>Each snippet is CSV-escaped on its own and printed as-is; no separator is written between
  * consecutive snippets. Nothing is printed when the document has no snippet list for the field.
  *
  * @param doc the result document the snippets are read from
  * @param field the snippet field whose values are printed
  */
 private void renderSnippetValue(ResultDocument doc, SnippetField field) {
   final List<FieldValueItem> snippetValues = doc.getSnippetValues(field);
   if (snippetValues != null) {
     for (FieldValueItem item : snippetValues) {
       final String escaped = StringEscapeUtils.escapeCsv(item.getValue());
       writer.print(escaped);
     }
   }
 }