Esempio n. 1
0
  /**
   * Returns the documents to index for this crawl, building the list on the first call and
   * returning the cached {@code targetIndexDocuments} field on later calls.
   *
   * <p>One target document is produced per parser result. Each one is created with the language
   * of {@code urlItem}, receives the URL fields mapped through {@code urlFieldMap}, then the
   * fields of the parser result (when the result is not null). If an index plugin list is
   * configured and its {@code run} call returns {@code false}, the document is left out and the
   * index status of {@code urlItem} is set to {@code PLUGIN_REJECTED}.
   *
   * <p>The whole body runs while holding the monitor of {@code this}.
   *
   * @return the cached list; empty when there is no parser or the parser has no result list
   * @throws SearchLibException declared by the method; raised by the calls made in the body
   * @throws IOException declared by the method; raised by the calls made in the body
   * @throws URISyntaxException declared by the method; raised by the calls made in the body
   */
  public List<IndexDocument> getTargetIndexDocuments()
      throws SearchLibException, IOException, URISyntaxException {
    synchronized (this) {
      if (targetIndexDocuments != null) {
        return targetIndexDocuments;
      }

      // The field is assigned before anything else so that every exit path below
      // leaves a non-null (possibly empty) list behind for the next call.
      targetIndexDocuments = new ArrayList<IndexDocument>(0);

      List<ParserResultItem> parserResults = (parser == null) ? null : parser.getParserResults();
      if (parserResults == null) {
        return targetIndexDocuments;
      }

      for (ParserResultItem parserResult : parserResults) {
        IndexDocument targetDoc = new IndexDocument(LanguageEnum.findByCode(urlItem.getLang()));

        // Intermediate document holding the URL fields before they are mapped to the target.
        IndexDocument urlDoc = new IndexDocument();
        urlItem.populate(urlDoc);
        urlFieldMap.mapIndexDocument(urlDoc, targetDoc);

        if (parserResult != null) {
          parserResult.populate(targetDoc);
        }

        IndexPluginList plugins = config.getWebCrawlMaster().getIndexPluginList();
        boolean rejected =
            plugins != null
                && !plugins.run((Client) config, getContentType(), getStreamLimiter(), targetDoc);

        if (rejected) {
          urlItem.setIndexStatus(IndexStatus.PLUGIN_REJECTED);
          urlItem.populate(urlDoc);
        } else {
          targetIndexDocuments.add(targetDoc);
        }
      }
      return targetIndexDocuments;
    }
  }
Esempio n. 2
0
  /**
   * Parses the given content and records the outcome on {@code urlItem}.
   *
   * <p>The method:
   *
   * <ol>
   *   <li>sets the parser status to {@code NOPARSER} and returns when there is no parser selector
   *       or when the selector yields no parser;
   *   <li>sets the parser status to {@code PARSER_ERROR} and returns when the parser reports an
   *       error;
   *   <li>otherwise replaces the in/out links of {@code urlItem} and copies language, language
   *       method and charset from the parser results;
   *   <li>sets the parser status to {@code PARSED}, or {@code PARSED_NON_CANONICAL} for an {@code
   *       HtmlParser} that is not canonical;
   *   <li>stores the parser's MD5/size signature and, when there was no previous content update
   *       date or the signature changed, sets the content update date to the time parsing
   *       started;
   *   <li>sets the index status to {@code META_NOINDEX} when a result carries a {@code
   *       meta_robots} value equal to "noindex" (case-insensitive).
   * </ol>
   *
   * <p>A null result list, or a null item in it, is skipped rather than dereferenced.
   *
   * @param inputStream the content handed to {@code parserSelector.parseStream}
   * @throws InstantiationException declared by the method; raised by the calls made in the body
   * @throws IllegalAccessException declared by the method; raised by the calls made in the body
   * @throws ClassNotFoundException declared by the method; raised by the calls made in the body
   * @throws IOException declared by the method; raised by the calls made in the body
   * @throws SearchLibException declared by the method; raised by the calls made in the body
   * @throws NoSuchAlgorithmException declared by the method; raised by the calls made in the body
   * @throws URISyntaxException declared by the method; raised by the calls made in the body
   */
  protected void parseContent(InputStream inputStream)
      throws InstantiationException, IllegalAccessException, ClassNotFoundException, IOException,
          SearchLibException, NoSuchAlgorithmException, URISyntaxException {
    if (parserSelector == null) {
      urlItem.setParserStatus(ParserStatus.NOPARSER);
      return;
    }

    // Prefer the Content-Disposition file name; fall back to the name part of the URL.
    String fileName = urlItem.getContentDispositionFilename();
    if (fileName == null) {
      URL url = urlItem.getURL();
      if (url != null) {
        fileName = FilenameUtils.getName(url.getFile());
      }
    }

    IndexDocument sourceDocument = new IndexDocument();
    urlItem.populate(sourceDocument);
    Date parserStartDate = new Date();
    // TODO Which language for OCR ?
    // NOTE(review): the null argument below is presumably that language - confirm.
    parser =
        parserSelector.parseStream(
            sourceDocument,
            fileName,
            urlItem.getContentBaseType(),
            urlItem.getUrl(),
            inputStream,
            null,
            parserSelector.getWebCrawlerDefaultParser(),
            parserSelector.getFileCrawlerDefaultParser());
    if (parser == null) {
      urlItem.setParserStatus(ParserStatus.NOPARSER);
      return;
    }

    if (parser.getError() != null) {
      urlItem.setParserStatus(ParserStatus.PARSER_ERROR);
      return;
    }
    urlItem.clearInLinks();
    urlItem.clearOutLinks();

    // The result list may be null and may contain null items; skip those instead of failing.
    List<ParserResultItem> results = parser.getParserResults();

    if (results != null) {
      for (ParserResultItem result : results) {
        if (result == null) {
          continue;
        }
        urlItem.addInLinks(result.getFieldContent(ParserFieldEnum.internal_link));
        urlItem.addInLinks(result.getFieldContent(ParserFieldEnum.internal_link_nofollow));
        urlItem.addOutLinks(result.getFieldContent(ParserFieldEnum.external_link));
        urlItem.addOutLinks(result.getFieldContent(ParserFieldEnum.external_link_nofollow));
        // With several results, the values of the last one win.
        urlItem.setLang(result.getFieldValue(ParserFieldEnum.lang, 0));
        urlItem.setLangMethod(result.getFieldValue(ParserFieldEnum.lang_method, 0));
        urlItem.setContentTypeCharset(result.getFieldValue(ParserFieldEnum.charset, 0));
      }
    }

    ParserStatus parsedStatus = ParserStatus.PARSED;
    if (parser instanceof HtmlParser && !((HtmlParser) parser).isCanonical()) {
      parsedStatus = ParserStatus.PARSED_NON_CANONICAL;
    }
    urlItem.setParserStatus(parsedStatus);

    String oldMd5size = urlItem.getMd5size();
    String newMd5size = parser.getMd5size();
    urlItem.setMd5size(newMd5size);

    // The content update date moves to the parsing start time on first parse, or when both
    // signatures are known and differ. It is left untouched in every other case.
    Date oldContentUpdateDate = urlItem.getContentUpdateDate();
    boolean signatureChanged =
        oldMd5size != null && newMd5size != null && !oldMd5size.equals(newMd5size);
    if (oldContentUpdateDate == null || signatureChanged) {
      urlItem.setContentUpdateDate(parserStartDate);
    }

    if (results != null) {
      for (ParserResultItem result : results) {
        if (result == null) {
          continue;
        }
        FieldContent fieldContent = result.getFieldContent(ParserFieldEnum.meta_robots);
        if (fieldContent == null) {
          continue;
        }
        List<FieldValueItem> fieldValues = fieldContent.getValues();
        if (fieldValues == null) {
          continue;
        }
        for (FieldValueItem item : fieldValues) {
          if ("noindex".equalsIgnoreCase(item.getValue())) {
            urlItem.setIndexStatus(IndexStatus.META_NOINDEX);
            break;
          }
        }
      }
    }
  }