/** Get cash on hand, divide it by the number of outlinks and apply. */ @Override public void distributeScoreToOutlinks( String fromUrl, WebPage row, Collection<ScoreDatum> scoreData, int allCount) { ByteBuffer cashRaw = row.getMetadata().get(CASH_KEY); if (cashRaw == null) { return; } float cash = Bytes.toFloat(cashRaw.array(), cashRaw.arrayOffset() + cashRaw.position()); if (cash == 0) { return; } // TODO: count filtered vs. all count for outlinks float scoreUnit = cash / allCount; // internal and external score factor float internalScore = scoreUnit * internalScoreFactor; float externalScore = scoreUnit * externalScoreFactor; for (ScoreDatum scoreDatum : scoreData) { try { String toHost = new URL(scoreDatum.getUrl()).getHost(); String fromHost = new URL(fromUrl.toString()).getHost(); if (toHost.equalsIgnoreCase(fromHost)) { scoreDatum.setScore(internalScore); } else { scoreDatum.setScore(externalScore); } } catch (MalformedURLException e) { LOG.error("Failed with the following MalformedURLException: ", e); scoreDatum.setScore(externalScore); } } // reset cash to zero row.getMetadata().put(CASH_KEY, ByteBuffer.wrap(Bytes.toBytes(0.0f))); }
/**
 * Adds the scores of all inlinks to the page score and credits the new score to the page's cash.
 *
 * <p>The cash is read from the {@code CASH_KEY} metadata entry (taken as {@code 1.0f} when the
 * entry is missing), increased by the updated score and written back under the same key.
 *
 * @param url URL of the page; only used in the trace output
 * @param page page whose score and cash are updated
 * @param inlinkedScoreData score contributions of the inlinks
 */
@Override
public void updateScore(String url, WebPage page, List<ScoreDatum> inlinkedScoreData) {
  float total = page.getScore();
  for (ScoreDatum inlink : inlinkedScoreData) {
    LOG.trace("adding <" + inlink.getUrl() + ", " + inlink.getScore() + ">");
    total += inlink.getScore();
  }
  LOG.trace(url + ": " + total + " (" + page.getScore() + ")");
  page.setScore(total);

  ByteBuffer storedCash = page.getMetadata().get(CASH_KEY);
  float cash =
      (storedCash == null)
          ? 1.0f
          : Bytes.toFloat(storedCash.array(), storedCash.arrayOffset() + storedCash.position());
  float replenished = cash + total;
  page.getMetadata().put(CASH_KEY, ByteBuffer.wrap(Bytes.toBytes(replenished)));
}
/** * The {@link RelTagIndexingFilter} filter object. * * @param doc The {@link NutchDocument} object * @param url URL to be filtered for rel-tag's * @param page {@link WebPage} object relative to the URL * @return filtered NutchDocument */ @Override public NutchDocument filter(NutchDocument doc, String url, WebPage page) throws IndexingException { // Check if some Rel-Tags found, possibly put there by RelTagParser ByteBuffer bb = page.getFromMetadata(new Utf8(RelTagParser.REL_TAG)); if (bb != null) { String[] tags = Bytes.toString(bb).split("\t"); for (int i = 0; i < tags.length; i++) { doc.add("tag", tags[i]); } } return doc; }
/**
 * Sets both the score and the cash of the row to {@code 1.0f}.
 *
 * <p>The initial value should equal the injected value, and it must be non-zero.
 *
 * @param url URL of the page; not consulted here
 * @param row page whose score and {@code CASH_KEY} metadata entry are initialised
 */
@Override
public void initialScore(String url, WebPage row) throws ScoringFilterException {
  final float initial = 1.0f;
  row.setScore(initial);
  ByteBuffer initialCash = ByteBuffer.wrap(Bytes.toBytes(initial));
  row.getMetadata().put(CASH_KEY, initialCash);
}
/**
 * Stores the row's current score as its cash.
 *
 * @param url URL of the page; not consulted here
 * @param row page whose score is copied into the {@code CASH_KEY} metadata entry
 */
@Override
public void injectedScore(String url, WebPage row) throws ScoringFilterException {
  final float injected = row.getScore();
  ByteBuffer injectedCash = ByteBuffer.wrap(Bytes.toBytes(injected));
  row.getMetadata().put(CASH_KEY, injectedCash);
}
public int run(String[] args) throws Exception { boolean dumpText = false; boolean force = false; String contentType = null; String url = null; String usage = "Usage: ParserChecker [-dumpText] [-forceAs mimeType] url"; if (args.length == 0) { LOG.error(usage); return (-1); } for (int i = 0; i < args.length; i++) { if (args[i].equals("-forceAs")) { force = true; contentType = args[++i]; } else if (args[i].equals("-dumpText")) { dumpText = true; } else if (i != args.length - 1) { LOG.error(usage); System.exit(-1); } else { url = URLUtil.toASCII(args[i]); } } if (LOG.isInfoEnabled()) { LOG.info("fetching: " + url); } ProtocolFactory factory = new ProtocolFactory(conf); Protocol protocol = factory.getProtocol(url); WebPage page = WebPage.newBuilder().build(); ProtocolOutput protocolOutput = protocol.getProtocolOutput(url, page); if (!protocolOutput.getStatus().isSuccess()) { LOG.error( "Fetch failed with protocol status: " + ProtocolStatusUtils.getName(protocolOutput.getStatus().getCode()) + ": " + ProtocolStatusUtils.getMessage(protocolOutput.getStatus())); return (-1); } Content content = protocolOutput.getContent(); if (content == null) { LOG.error("No content for " + url); return (-1); } page.setBaseUrl(new org.apache.avro.util.Utf8(url)); page.setContent(ByteBuffer.wrap(content.getContent())); if (force) { content.setContentType(contentType); } else { contentType = content.getContentType(); } if (contentType == null) { LOG.error("Failed to determine content type!"); return (-1); } page.setContentType(new Utf8(contentType)); if (ParserJob.isTruncated(url, page)) { LOG.warn("Content is truncated, parse may fail!"); } Parse parse = new ParseUtil(conf).parse(url, page); if (parse == null) { LOG.error("Problem with parse - check log"); return (-1); } // Calculate the signature byte[] signature = SignatureFactory.getSignature(getConf()).calculate(page); if (LOG.isInfoEnabled()) { LOG.info("parsing: " + url); LOG.info("contentType: " + contentType); LOG.info("signature: 
" + StringUtil.toHexString(signature)); } LOG.info("---------\nUrl\n---------------\n"); System.out.print(url + "\n"); LOG.info("---------\nMetadata\n---------\n"); Map<CharSequence, ByteBuffer> metadata = page.getMetadata(); StringBuffer sb = new StringBuffer(); if (metadata != null) { Iterator<Entry<CharSequence, ByteBuffer>> iterator = metadata.entrySet().iterator(); while (iterator.hasNext()) { Entry<CharSequence, ByteBuffer> entry = iterator.next(); sb.append(entry.getKey().toString()) .append(" : \t") .append(Bytes.toString(entry.getValue())) .append("\n"); } System.out.print(sb.toString()); } LOG.info("---------\nOutlinks\n---------\n"); sb = new StringBuffer(); for (Outlink l : parse.getOutlinks()) { sb.append(" outlink: ").append(l).append('\n'); } System.out.print(sb.toString()); if (page.getHeaders() != null) { LOG.info("---------\nHeaders\n---------\n"); Map<CharSequence, CharSequence> headers = page.getHeaders(); StringBuffer headersb = new StringBuffer(); if (metadata != null) { Iterator<Entry<CharSequence, CharSequence>> iterator = headers.entrySet().iterator(); while (iterator.hasNext()) { Entry<CharSequence, CharSequence> entry = iterator.next(); headersb .append(entry.getKey().toString()) .append(" : \t") .append(entry.getValue()) .append("\n"); } System.out.print(headersb.toString()); } } if (dumpText) { LOG.info("---------\nParseText\n---------\n"); System.out.print(parse.getText()); } return 0; }