/**
 * Returns the index documents derived from the parser results, building
 * them on the first call and returning the cached list afterwards.
 * <p>
 * The whole method runs while holding the monitor of {@code this}. The
 * cache is initialised to an empty list before the parser is consulted, so
 * once this method has been called without a parser (or without parser
 * results) later calls keep returning that same empty list.
 * <p>
 * For every parser result a new target document is created with the
 * language resolved from the URL item's language code. The URL item is
 * populated into a temporary document which is mapped onto the target
 * through {@code urlFieldMap}; a non-null parser result then populates the
 * target as well. When an index plugin list is available and its
 * {@code run} call returns {@code false}, the URL item is flagged
 * {@link IndexStatus#PLUGIN_REJECTED} and the document is left out.
 *
 * @return the cached list of target documents, never {@code null}
 * @throws SearchLibException
 *             declared by this method; raised by calls made in the body
 * @throws IOException
 *             declared by this method; raised by calls made in the body
 * @throws URISyntaxException
 *             declared by this method; raised by calls made in the body
 */
public List<IndexDocument> getTargetIndexDocuments()
        throws SearchLibException, IOException, URISyntaxException {
    synchronized (this) {
        if (targetIndexDocuments == null) {
            targetIndexDocuments = new ArrayList<IndexDocument>(0);
            List<ParserResultItem> parserResults = null;
            if (parser != null) {
                parserResults = parser.getParserResults();
            }
            if (parserResults != null) {
                for (ParserResultItem parserResult : parserResults) {
                    IndexDocument targetDoc = new IndexDocument(
                            LanguageEnum.findByCode(urlItem.getLang()));

                    // Copy the URL item fields into the target through the
                    // configured field mapping.
                    IndexDocument urlDoc = new IndexDocument();
                    urlItem.populate(urlDoc);
                    urlFieldMap.mapIndexDocument(urlDoc, targetDoc);

                    if (parserResult != null) {
                        parserResult.populate(targetDoc);
                    }

                    // The plugin list is looked up for each result; a
                    // missing list means nothing can reject the document.
                    IndexPluginList plugins = config.getWebCrawlMaster()
                            .getIndexPluginList();
                    boolean accepted = plugins == null
                            || plugins.run((Client) config, getContentType(),
                                    getStreamLimiter(), targetDoc);

                    if (accepted) {
                        targetIndexDocuments.add(targetDoc);
                    } else {
                        urlItem.setIndexStatus(IndexStatus.PLUGIN_REJECTED);
                        urlItem.populate(urlDoc);
                    }
                }
            }
        }
        return targetIndexDocuments;
    }
}
/**
 * Parses the given content stream and updates the URL item from the parser
 * output.
 * <p>
 * Steps, in order:
 * <ul>
 * <li>Without a parser selector the URL item is flagged
 * {@link ParserStatus#NOPARSER} and nothing else happens.</li>
 * <li>The file name is taken from the content-disposition file name of the
 * URL item, falling back to the name part of the URL's file component.</li>
 * <li>The selected parser is stored in the {@code parser} field. A
 * {@code null} parser yields {@link ParserStatus#NOPARSER}; a parser
 * reporting an error yields {@link ParserStatus#PARSER_ERROR}.</li>
 * <li>In/out links of the URL item are cleared and refilled from every
 * parser result; language, language method and charset are overwritten by
 * each result in turn, so the last result wins.</li>
 * <li>The parser status becomes {@link ParserStatus#PARSED}, or
 * {@link ParserStatus#PARSED_NON_CANONICAL} for a non-canonical
 * {@code HtmlParser}.</li>
 * <li>The md5size is stored. The content update date is set to the time
 * parsing started when no previous date exists, or when both the old and
 * the new md5size are non-null and differ.</li>
 * <li>If any result carries a {@code meta_robots} value equal to
 * {@code "noindex"} (ignoring case) the index status becomes
 * {@link IndexStatus#META_NOINDEX}.</li>
 * </ul>
 * A {@code null} parser result list, or a {@code null} element in it, is
 * skipped instead of raising a {@link NullPointerException}; this matches
 * the null handling of {@code getTargetIndexDocuments()}.
 *
 * @param inputStream
 *            the content stream handed to the parser selector
 * @throws InstantiationException
 *             declared by this method; raised by calls made in the body
 * @throws IllegalAccessException
 *             declared by this method; raised by calls made in the body
 * @throws ClassNotFoundException
 *             declared by this method; raised by calls made in the body
 * @throws IOException
 *             declared by this method; raised by calls made in the body
 * @throws SearchLibException
 *             declared by this method; raised by calls made in the body
 * @throws NoSuchAlgorithmException
 *             declared by this method; raised by calls made in the body
 * @throws URISyntaxException
 *             declared by this method; raised by calls made in the body
 */
protected void parseContent(InputStream inputStream)
        throws InstantiationException, IllegalAccessException,
        ClassNotFoundException, IOException, SearchLibException,
        NoSuchAlgorithmException, URISyntaxException {
    if (parserSelector == null) {
        urlItem.setParserStatus(ParserStatus.NOPARSER);
        return;
    }

    String fileName = urlItem.getContentDispositionFilename();
    if (fileName == null) {
        URL url = urlItem.getURL();
        if (url != null) {
            fileName = FilenameUtils.getName(url.getFile());
        }
    }

    IndexDocument sourceDocument = new IndexDocument();
    urlItem.populate(sourceDocument);
    // Captured before parsing; used below as the new content update date.
    Date parserStartDate = new Date();
    // TODO Which language for OCR ?
    parser = parserSelector.parseStream(sourceDocument, fileName,
            urlItem.getContentBaseType(), urlItem.getUrl(), inputStream,
            null, parserSelector.getWebCrawlerDefaultParser(),
            parserSelector.getFileCrawlerDefaultParser());
    if (parser == null) {
        urlItem.setParserStatus(ParserStatus.NOPARSER);
        return;
    }
    if (parser.getError() != null) {
        urlItem.setParserStatus(ParserStatus.PARSER_ERROR);
        return;
    }

    urlItem.clearInLinks();
    urlItem.clearOutLinks();

    // Fetched once and null-checked: the list (and its elements) may be
    // null, as getTargetIndexDocuments() already assumes.
    List<ParserResultItem> results = parser.getParserResults();
    if (results != null) {
        for (ParserResultItem result : results) {
            if (result == null) {
                continue;
            }
            urlItem.addInLinks(result
                    .getFieldContent(ParserFieldEnum.internal_link));
            urlItem.addInLinks(result
                    .getFieldContent(ParserFieldEnum.internal_link_nofollow));
            urlItem.addOutLinks(result
                    .getFieldContent(ParserFieldEnum.external_link));
            urlItem.addOutLinks(result
                    .getFieldContent(ParserFieldEnum.external_link_nofollow));
            urlItem.setLang(result.getFieldValue(ParserFieldEnum.lang, 0));
            urlItem.setLangMethod(result.getFieldValue(
                    ParserFieldEnum.lang_method, 0));
            urlItem.setContentTypeCharset(result.getFieldValue(
                    ParserFieldEnum.charset, 0));
        }
    }

    ParserStatus parsedStatus = ParserStatus.PARSED;
    if (parser instanceof HtmlParser) {
        if (!((HtmlParser) parser).isCanonical()) {
            parsedStatus = ParserStatus.PARSED_NON_CANONICAL;
        }
    }
    urlItem.setParserStatus(parsedStatus);

    String oldMd5size = urlItem.getMd5size();
    String newMd5size = parser.getMd5size();
    urlItem.setMd5size(newMd5size);

    Date oldContentUpdateDate = urlItem.getContentUpdateDate();
    Date newContentUpdateDate = null;
    if (oldContentUpdateDate == null) {
        newContentUpdateDate = parserStartDate;
    } else if (oldMd5size != null && newMd5size != null) {
        // A change is only recorded when both signatures are known.
        if (!oldMd5size.equals(newMd5size)) {
            newContentUpdateDate = parserStartDate;
        }
    }
    if (newContentUpdateDate != null) {
        urlItem.setContentUpdateDate(newContentUpdateDate);
    }

    if (results != null) {
        for (ParserResultItem result : results) {
            if (result == null) {
                continue;
            }
            FieldContent fieldContent = result
                    .getFieldContent(ParserFieldEnum.meta_robots);
            if (fieldContent == null) {
                continue;
            }
            List<FieldValueItem> fieldValues = fieldContent.getValues();
            if (fieldValues == null) {
                continue;
            }
            // Iterate the values that were just null-checked instead of
            // fetching them a second time.
            for (FieldValueItem item : fieldValues) {
                if ("noindex".equalsIgnoreCase(item.getValue())) {
                    urlItem.setIndexStatus(IndexStatus.META_NOINDEX);
                    break;
                }
            }
        }
    }
}