/**
 * Tries each {@link Parser} that the parser factory returns for the page's content type, in
 * order, and returns the first {@link Parse} whose status is successful.
 *
 * <p>When {@code maxParseTime} is {@code -1} the parser is invoked directly; otherwise the call
 * is delegated to {@code runParser}. If no parser yields a successful result, a warning is
 * logged and an empty parse wrapping a {@link ParseException} is returned.
 *
 * @param url the URL of the page being parsed; used for parser lookup and log messages
 * @param page the page whose content type selects the parsers and which is handed to them
 * @return the first successful parse, or an empty parse if every parser failed
 * @throws ParserNotFound If there is no suitable parser found.
 * @throws ParseException If there is an error parsing.
 */
public Parse parse(String url, WebPage page) throws ParserNotFound, ParseException {
  final String mimeType = TableUtil.toString(page.getContentType());
  final Parser[] candidates = this.parserFactory.getParsers(mimeType, url);

  for (final Parser candidate : candidates) {
    if (LOG.isDebugEnabled()) {
      LOG.debug("Parsing [" + url + "] with [" + candidate + "]");
    }

    final Parse result;
    if (maxParseTime == -1) {
      // No time limit configured: call the parser directly.
      result = candidate.getParse(url, page);
    } else {
      result = runParser(candidate, url, page);
    }

    if (result == null || !ParseStatusUtils.isSuccess(result.getParseStatus())) {
      // This parser did not produce a usable result; fall through to the next one.
      continue;
    }
    return result;
  }

  LOG.warn("Unable to successfully parse content " + url + " of type " + mimeType);
  final ParseException failure = new ParseException("Unable to successfully parse content");
  return ParseStatusUtils.getEmptyParse(failure, null);
}
/** * Parses given web page and stores parsed content within page. Puts a meta-redirect to outlinks. * * @param key * @param page */ public void process(String key, WebPage page) { String url = TableUtil.unreverseUrl(key); byte status = page.getStatus().byteValue(); if (status != CrawlStatus.STATUS_FETCHED) { if (LOG.isDebugEnabled()) { LOG.debug("Skipping " + url + " as status is: " + CrawlStatus.getName(status)); } return; } Parse parse; try { parse = parse(url, page); } catch (ParserNotFound e) { // do not print stacktrace for the fact that some types are not mapped. LOG.warn("No suitable parser found: " + e.getMessage()); return; } catch (final Exception e) { LOG.warn("Error parsing: " + url + ": " + StringUtils.stringifyException(e)); return; } if (parse == null) { return; } org.apache.nutch.storage.ParseStatus pstatus = parse.getParseStatus(); page.setParseStatus(pstatus); if (ParseStatusUtils.isSuccess(pstatus)) { if (pstatus.getMinorCode() == ParseStatusCodes.SUCCESS_REDIRECT) { String newUrl = ParseStatusUtils.getMessage(pstatus); int refreshTime = Integer.parseInt(ParseStatusUtils.getArg(pstatus, 1)); try { newUrl = normalizers.normalize(newUrl, URLNormalizers.SCOPE_FETCHER); if (newUrl == null) { LOG.warn("redirect normalized to null " + url); return; } try { newUrl = filters.filter(newUrl); } catch (URLFilterException e) { return; } if (newUrl == null) { LOG.warn("redirect filtered to null " + url); return; } } catch (MalformedURLException e) { LOG.warn("malformed url exception parsing redirect " + url); return; } page.getOutlinks().put(new Utf8(newUrl), new Utf8()); page.getMetadata().put(FetcherJob.REDIRECT_DISCOVERED, TableUtil.YES_VAL); if (newUrl == null || newUrl.equals(url)) { String reprUrl = URLUtil.chooseRepr(url, newUrl, refreshTime < FetcherJob.PERM_REFRESH_TIME); if (reprUrl == null) { LOG.warn("reprUrl==null for " + url); return; } else { page.setReprUrl(new Utf8(reprUrl)); } } } else { page.setText(new Utf8(parse.getText())); 
page.setTitle(new Utf8(parse.getTitle())); ByteBuffer prevSig = page.getSignature(); if (prevSig != null) { page.setPrevSignature(prevSig); } final byte[] signature = sig.calculate(page); page.setSignature(ByteBuffer.wrap(signature)); if (page.getOutlinks() != null) { page.getOutlinks().clear(); } final Outlink[] outlinks = parse.getOutlinks(); int outlinksToStore = Math.min(maxOutlinks, outlinks.length); String fromHost; if (ignoreExternalLinks) { try { fromHost = new URL(url).getHost().toLowerCase(); } catch (final MalformedURLException e) { fromHost = null; } } else { fromHost = null; } int validCount = 0; for (int i = 0; validCount < outlinksToStore && i < outlinks.length; i++) { String toUrl = outlinks[i].getToUrl(); try { toUrl = normalizers.normalize(toUrl, URLNormalizers.SCOPE_OUTLINK); toUrl = filters.filter(toUrl); } catch (MalformedURLException e2) { continue; } catch (URLFilterException e) { continue; } if (toUrl == null) { continue; } Utf8 utf8ToUrl = new Utf8(toUrl); if (page.getOutlinks().get(utf8ToUrl) != null) { // skip duplicate outlinks continue; } String toHost; if (ignoreExternalLinks) { try { toHost = new URL(toUrl).getHost().toLowerCase(); } catch (final MalformedURLException e) { toHost = null; } if (toHost == null || !toHost.equals(fromHost)) { // external links continue; // skip it } } validCount++; page.getOutlinks().put(utf8ToUrl, new Utf8(outlinks[i].getAnchor())); } Utf8 fetchMark = Mark.FETCH_MARK.checkMark(page); if (fetchMark != null) { Mark.PARSE_MARK.putMark(page, fetchMark); } } } }