@Override public String push(final Request entry, CrawlProfile profile, final RobotsTxt robots) throws IOException, SpaceExceededException { assert entry != null; final byte[] hash = entry.url().hash(); synchronized (this) { // double-check if (this.has(hash)) return "double occurrence in urlFileIndex"; // increase dom counter if (profile != null) { int maxPages = profile.domMaxPages(); if (maxPages != Integer.MAX_VALUE && maxPages > 0) { String host = entry.url().getHost(); profile.domInc(host); } } // add to index Index depthStack = getStack(entry.depth()); final int s = depthStack.size(); depthStack.put(entry.toRow()); assert s < depthStack.size() : "hash = " + ASCII.String(hash) + ", s = " + s + ", size = " + depthStack.size(); assert depthStack.has(hash) : "hash = " + ASCII.String(hash); } return null; }
public static serverObjects respond( @SuppressWarnings("unused") final RequestHeader header, final serverObjects post, final serverSwitch env) { // return variable that accumulates replacements final Switchboard sb = (Switchboard) env; final serverObjects prop = new serverObjects(); final Segment segment = sb.index; final SolrConnector connector = segment.fulltext().getDefaultConnector(); // avoid UNRESOLVED PATTERN prop.put("url", ""); prop.put("citations", 0); prop.put("sentences", 0); DigestURL uri = null; String url = ""; String hash = ""; int ch = 10; boolean filter = false; // show cited sentences only if (post != null) { if (post.containsKey("url")) { url = post.get("url"); if (!url.startsWith("http://") && !url.startsWith("https://") && !url.startsWith("ftp://") && !url.startsWith("smb://") && !url.startsWith("file://")) { url = "http://" + url; } } if (post.containsKey("hash")) { hash = post.get("hash"); } if (post.containsKey("ch")) { ch = post.getInt("ch", ch); } filter = post.getBoolean("filter"); } prop.put("filter", filter); if (url.length() > 0) { try { uri = new DigestURL(url, null); hash = ASCII.String(uri.hash()); } catch (final MalformedURLException e) { } } if (uri == null && hash.length() > 0) { try { uri = sb.getURL(ASCII.getBytes(hash)); if (uri == null) { connector.commit(true); // try again, that url can be fresh uri = sb.getURL(ASCII.getBytes(hash)); } } catch (IOException e) { ConcurrentLog.logException(e); } } if (uri == null) return prop; // no proper url addressed url = uri.toNormalform(true); prop.put("url", url); // get the document from the index SolrDocument doc; try { doc = segment .fulltext() .getDefaultConnector() .getDocumentById( hash, CollectionSchema.title.getSolrFieldName(), CollectionSchema.text_t.getSolrFieldName()); } catch (final IOException e1) { return prop; } @SuppressWarnings("unchecked") ArrayList<String> title = (ArrayList<String>) doc.getFieldValue(CollectionSchema.title.getSolrFieldName()); String text = (String) doc.getFieldValue(CollectionSchema.text_t.getSolrFieldName()); ArrayList<String> sentences = new ArrayList<String>(); if (title != null) for (String s : title) if (s.length() > 0) sentences.add(s); if (text != null && !text.isEmpty()) { SentenceReader sr = new SentenceReader(text); StringBuilder line; while (sr.hasNext()) { line = sr.next(); if (line.length() > 0) sentences.add(line.toString()); } } // for each line make a statistic about the number of occurrences somewhere else OrderedScoreMap<String> scores = new OrderedScoreMap<String>(null); // accumulates scores for citating urls LinkedHashMap<String, Set<DigestURL>> sentenceOcc = new LinkedHashMap<String, Set<DigestURL>>(); for (String sentence : sentences) { if (sentence == null || sentence.length() < 40) { // do not count the very short sentences sentenceOcc.put(sentence, null); continue; } try { sentence = sentence.replace('"', '\''); SolrDocumentList doclist = connector.getDocumentListByQuery( "text_t:\"" + sentence + "\"", CollectionSchema.url_chars_i.getSolrFieldName() + " asc", 0, 100, CollectionSchema.sku.getSolrFieldName()); int count = (int) doclist.getNumFound(); if (count > 0) { Set<DigestURL> list = new TreeSet<DigestURL>(); for (SolrDocument d : doclist) { String u = (String) d.getFieldValue(CollectionSchema.sku.getSolrFieldName()); if (u == null || u.equals(url)) continue; scores.inc(u); try { list.add(new DigestURL(u, null)); } catch (final MalformedURLException e) { } } sentenceOcc.put(sentence, list); } } catch (final Throwable ee) { } } sentences.clear(); // we do not need this again // iterate the sentences int i = 0; int sentenceNr = 0; for (Map.Entry<String, Set<DigestURL>> se : sentenceOcc.entrySet()) { Set<DigestURL> app = se.getValue(); if (filter) { // prepare list, only include sentence with citation if (app != null && app.size() > 0) { StringBuilder dd = new StringBuilder(se.getKey()); prop.put("sentences_" + i + "_dt", sentenceNr); dd.append("<br/>appears in:"); for (DigestURL u : app) { if (u != null) { dd.append(" <a href=\"") .append(u.toNormalform(false)) .append("\">") .append(u.getHost()) .append("</a>"); } } prop.put("sentences_" + i + "_dd", dd.toString()); i++; } } else { // prepare list, include all sentences StringBuilder dd = new StringBuilder(se.getKey()); prop.put("sentences_" + i + "_dt", sentenceNr); if (app != null && app.size() > 0) { dd.append("<br/>appears in:"); for (DigestURL u : app) { if (u != null) { dd.append(" <a href=\"") .append(u.toNormalform(false)) .append("\">") .append(u.getHost()) .append("</a>"); } } } prop.put("sentences_" + i + "_dd", dd.toString()); i++; } sentenceNr++; } prop.put("sentences", i); // iterate the citations in order of number of citations i = 0; for (String u : scores.keyList(false)) { try { DigestURL uu = new DigestURL(u, null); prop.put("citations_" + i + "_dt", "<a href=\"" + u + "\">" + u + "</a>"); StringBuilder dd = new StringBuilder(); dd.append("makes ") .append(Integer.toString(scores.get(u))) .append(" citations: of ") .append(url); for (Map.Entry<String, Set<DigestURL>> se : sentenceOcc.entrySet()) { Set<DigestURL> occurls = se.getValue(); if (occurls != null && occurls.contains(uu)) dd.append("<br/><a href=\"/solr/select?q=text_t:%22") .append(se.getKey().replace('"', '\'')) .append("%22&rows=100&grep=&wt=grephtml\">") .append(se.getKey()) .append("</a>"); } prop.put("citations_" + i + "_dd", dd.toString()); i++; } catch (final MalformedURLException e) { } } prop.put("citations", i); // find similar documents from different hosts i = 0; for (String u : scores.keyList(false)) { if (scores.get(u) < ch) continue; try { DigestURL uu = new DigestURL(u, null); if (uu.getOrganization().equals(uri.getOrganization())) continue; prop.put("similar_links_" + i + "_url", u); i++; } catch (final MalformedURLException e) { } } prop.put("similar_links", i); prop.put("similar", i > 0 ? 1 : 0); // return rewrite properties return prop; }