Java URLUtil.getHost Exemples

Langage de programmation: Java

Espace de nommage/Paquet : org.apache.nutch.util

Class/Type: URLUtil

Méthode/Fonction: getHost

Exemples sur hotexamples.com : 2

Java URLUtil.getHost - 2 exemples trouvés. Ce sont les exemples réels les mieux notés de org.apache.nutch.util.URLUtil.getHost extraits de projets open source. Vous pouvez noter les exemples pour nous aider à en améliorer la qualité.

Méthodes fréquemment utilisées

Afficher Cacher

chooseRepr(2)

getDomainName(2)

getHost(2)

toASCII(1)

Méthodes fréquemment utilisées

chooseRepr (2)

getDomainName (2)

getHost (2)

toASCII (1)

Associées

IUserInstance

System

RestSearchScrollAction

Outline

Eviction

AnySubType

RouteContainer

Questionari

County

DateDialog

Associées dans d'autres langages

nc_subdivision_form_design (PHP)

ActionList (PHP)

ChuDe (C#)

typeOfQuestion (C#)

gtk_spin_button_get_adjustment (C++)

ARGUMENT_PRESENT (C++)

Set (Go)

Blue (Go)

point_dist (Python)

Lexer (Python)

Exemple #1

0

Afficher le fichier

Fichier : GeneratorReducer.java Projet : vuquangtin/apache-nutch-2.2.2

/**
 * Marks the pages grouped under {@code key} for generation and writes them out,
 * keyed by the reversed form of {@code key.url}.
 *
 * <p>Processing stops (the method returns) as soon as the running {@code count}
 * reaches {@code limit}. When {@code maxCount} is positive, it also stops once the
 * number of pages already accepted for the key's host (or domain, when
 * {@code byDomain} is set) has reached {@code maxCount}.
 *
 * <p>A page whose URL cannot be reversed is counted under
 * {@code Generator/MALFORMED_URL} and is not written; note that the generate mark
 * and batch id have already been set on that page object by then.
 *
 * @param key selector entry whose {@code url} identifies the pages
 * @param values pages grouped under {@code key}
 * @param context context used for output and counters
 * @throws IOException propagated from the URL helpers or the context write
 * @throws InterruptedException propagated from the context write
 */
@Override
protected void reduce(SelectorEntry key, Iterable<WebPage> values, Context context)
    throws IOException, InterruptedException {
  for (WebPage page : values) {
    // Overall cap on the number of generated pages.
    if (count >= limit) {
      return;
    }

    if (maxCount > 0) {
      // Per-host or per-domain cap, depending on configuration.
      String hostordomain =
          byDomain ? URLUtil.getDomainName(key.url) : URLUtil.getHost(key.url);

      Integer acceptedSoFar = hostCountMap.get(hostordomain);
      if (acceptedSoFar == null) {
        acceptedSoFar = 0;
        hostCountMap.put(hostordomain, 0);
      }
      if (acceptedSoFar >= maxCount) {
        return;
      }
      hostCountMap.put(hostordomain, acceptedSoFar + 1);
    }

    Mark.GENERATE_MARK.putMark(page, batchId);
    page.setBatchId(batchId);

    try {
      context.write(TableUtil.reverseUrl(key.url), page);
      // Only reached when the write succeeded.
      context.getCounter("Generator", "GENERATE_MARK").increment(1);
      count++;
    } catch (MalformedURLException e) {
      // Skip this page but keep processing the remaining ones.
      context.getCounter("Generator", "MALFORMED_URL").increment(1);
    }
  }
}

Exemple #2

0

Afficher le fichier

Fichier : FetcherThread.java Projet : Ccccz/nutch

private ParseStatus output( Text key, CrawlDatum datum, Content content, ProtocolStatus pstatus, int status, int outlinkDepth) { datum.setStatus(status); datum.setFetchTime(System.currentTimeMillis()); if (pstatus != null) datum.getMetaData().put(Nutch.WRITABLE_PROTO_STATUS_KEY, pstatus); ParseResult parseResult = null; if (content != null) { Metadata metadata = content.getMetadata(); // store the guessed content type in the crawldatum if (content.getContentType() != null) datum .getMetaData() .put(new Text(Metadata.CONTENT_TYPE), new Text(content.getContentType())); // add segment to metadata metadata.set(Nutch.SEGMENT_NAME_KEY, segmentName); // add score to content metadata so that ParseSegment can pick it up. try { scfilters.passScoreBeforeParsing(key, datum, content); } catch (Exception e) { if (LOG.isWarnEnabled()) { LOG.warn("Couldn't pass score, url " + key + " (" + e + ")"); } } /* * Note: Fetcher will only follow meta-redirects coming from the * original URL. */ if (parsing && status == CrawlDatum.STATUS_FETCH_SUCCESS) { if (!skipTruncated || (skipTruncated && !ParseSegment.isTruncated(content))) { try { parseResult = this.parseUtil.parse(content); } catch (Exception e) { LOG.warn("Error parsing: " + key + ": " + StringUtils.stringifyException(e)); } } if (parseResult == null) { byte[] signature = SignatureFactory.getSignature(conf) .calculate(content, new ParseStatus().getEmptyParse(conf)); datum.setSignature(signature); } } /* * Store status code in content So we can read this value during parsing * (as a separate job) and decide to parse or not. 
*/ content.getMetadata().add(Nutch.FETCH_STATUS_KEY, Integer.toString(status)); } try { output.collect(key, new NutchWritable(datum)); if (content != null && storingContent) output.collect(key, new NutchWritable(content)); if (parseResult != null) { for (Entry<Text, Parse> entry : parseResult) { Text url = entry.getKey(); Parse parse = entry.getValue(); ParseStatus parseStatus = parse.getData().getStatus(); ParseData parseData = parse.getData(); if (!parseStatus.isSuccess()) { LOG.warn("Error parsing: " + key + ": " + parseStatus); parse = parseStatus.getEmptyParse(conf); } // Calculate page signature. For non-parsing fetchers this will // be done in ParseSegment byte[] signature = SignatureFactory.getSignature(conf).calculate(content, parse); // Ensure segment name and score are in parseData metadata parseData.getContentMeta().set(Nutch.SEGMENT_NAME_KEY, segmentName); parseData.getContentMeta().set(Nutch.SIGNATURE_KEY, StringUtil.toHexString(signature)); // Pass fetch time to content meta parseData.getContentMeta().set(Nutch.FETCH_TIME_KEY, Long.toString(datum.getFetchTime())); if (url.equals(key)) datum.setSignature(signature); try { scfilters.passScoreAfterParsing(url, content, parse); } catch (Exception e) { if (LOG.isWarnEnabled()) { LOG.warn("Couldn't pass score, url " + key + " (" + e + ")"); } } String origin = null; // collect outlinks for subsequent db update Outlink[] links = parseData.getOutlinks(); int outlinksToStore = Math.min(maxOutlinks, links.length); if (ignoreExternalLinks || ignoreInternalLinks) { URL originURL = new URL(url.toString()); // based on domain? 
if ("bydomain".equalsIgnoreCase(ignoreExternalLinksMode)) { origin = URLUtil.getDomainName(originURL).toLowerCase(); } // use host else { origin = originURL.getHost().toLowerCase(); } } // used by fetchNode if (fetchNode != null) { fetchNode.setOutlinks(links); fetchNode.setTitle(parseData.getTitle()); FetchNodeDb.getInstance().put(fetchNode.getUrl().toString(), fetchNode); } int validCount = 0; // Process all outlinks, normalize, filter and deduplicate List<Outlink> outlinkList = new ArrayList<Outlink>(outlinksToStore); HashSet<String> outlinks = new HashSet<String>(outlinksToStore); for (int i = 0; i < links.length && validCount < outlinksToStore; i++) { String toUrl = links[i].getToUrl(); toUrl = ParseOutputFormat.filterNormalize( url.toString(), toUrl, origin, ignoreInternalLinks, ignoreExternalLinks, ignoreExternalLinksMode, urlFilters, urlExemptionFilters, normalizers); if (toUrl == null) { continue; } validCount++; links[i].setUrl(toUrl); outlinkList.add(links[i]); outlinks.add(toUrl); } // Only process depth N outlinks if (maxOutlinkDepth > 0 && outlinkDepth < maxOutlinkDepth) { reporter.incrCounter("FetcherOutlinks", "outlinks_detected", outlinks.size()); // Counter to limit num outlinks to follow per page int outlinkCounter = 0; // Calculate variable number of outlinks by depth using the // divisor (outlinks = Math.floor(divisor / depth * num.links)) int maxOutlinksByDepth = (int) Math.floor(outlinksDepthDivisor / (outlinkDepth + 1) * maxOutlinkDepthNumLinks); String followUrl; // Walk over the outlinks and add as new FetchItem to the queues Iterator<String> iter = outlinks.iterator(); while (iter.hasNext() && outlinkCounter < maxOutlinkDepthNumLinks) { followUrl = iter.next(); // Check whether we'll follow external outlinks if (outlinksIgnoreExternal) { if (!URLUtil.getHost(url.toString()).equals(URLUtil.getHost(followUrl))) { continue; } } reporter.incrCounter("FetcherOutlinks", "outlinks_following", 1); // Create new FetchItem with depth incremented 
FetchItem fit = FetchItem.create( new Text(followUrl), new CrawlDatum(CrawlDatum.STATUS_LINKED, interval), queueMode, outlinkDepth + 1); ((FetchItemQueues) fetchQueues).addFetchItem(fit); outlinkCounter++; } } // Overwrite the outlinks in ParseData with the normalized and // filtered set parseData.setOutlinks(outlinkList.toArray(new Outlink[outlinkList.size()])); output.collect( url, new NutchWritable( new ParseImpl(new ParseText(parse.getText()), parseData, parse.isCanonical()))); } } } catch (IOException e) { if (LOG.isErrorEnabled()) { LOG.error("fetcher caught:" + e.toString()); } } // return parse status if it exits if (parseResult != null && !parseResult.isEmpty()) { Parse p = parseResult.get(content.getUrl()); if (p != null) { reporter.incrCounter( "ParserStatus", ParseStatus.majorCodes[p.getData().getStatus().getMajorCode()], 1); return p.getData().getStatus(); } } return null; }