/**
 * Adds a crawl request to the stack that belongs to its crawl depth.
 *
 * <p>The duplicate check, the optional per-host counter update and the insertion all happen
 * while holding this object's monitor.
 *
 * @param entry the request to enqueue; must not be {@code null}
 * @param profile crawl profile of the request, may be {@code null}; when it defines a positive,
 *     bounded {@code domMaxPages()} the counter for the request's host is incremented
 * @param robots not used by this implementation
 * @return a textual rejection reason if the url hash is already known, otherwise {@code null}
 * @throws IOException propagated from the underlying stack operations
 * @throws SpaceExceededException propagated from the underlying stack operations
 */
@Override
  public String push(final Request entry, CrawlProfile profile, final RobotsTxt robots)
      throws IOException, SpaceExceededException {
    assert entry != null;
    final byte[] urlHash = entry.url().hash();
    synchronized (this) {
      // refuse entries whose url hash is already present
      if (this.has(urlHash)) {
        return "double occurrence in urlFileIndex";
      }

      // count the page for its host, but only if the profile limits pages per domain
      if (profile != null) {
        final int pageLimit = profile.domMaxPages();
        final boolean limited = pageLimit > 0 && pageLimit != Integer.MAX_VALUE;
        if (limited) {
          profile.domInc(entry.url().getHost());
        }
      }

      // store the request in the stack of its depth and verify that it arrived
      final Index stack = getStack(entry.depth());
      final int sizeBefore = stack.size();
      stack.put(entry.toRow());
      assert sizeBefore < stack.size()
          : "hash = "
              + ASCII.String(urlHash)
              + ", s = "
              + sizeBefore
              + ", size = "
              + stack.size();
      assert stack.has(urlHash) : "hash = " + ASCII.String(urlHash);
    }
    return null;
  }
Ejemplo n.º 2
0
  /**
   * Builds the template properties for a citation report of one indexed document.
   *
   * <p>The document is addressed through the request attributes {@code url} or {@code hash}. Its
   * title entries and the sentences of its text are looked up as phrase queries in the index;
   * every other document containing a sentence is counted as a citing url. The result contains
   *
   * <ul>
   *   <li>{@code sentences_<i>_dt/_dd}: the sentences, each with the urls it also appears in,
   *   <li>{@code citations_<i>_dt/_dd}: the citing urls with the sentences they share,
   *   <li>{@code similar_links_<i>_url}: citing urls with at least {@code ch} shared sentences
   *       whose organization differs from the one of the requested url.
   * </ul>
   *
   * @param header request header, not used
   * @param post request attributes; reads {@code url}, {@code hash}, {@code ch} (default 10) and
   *     {@code filter} (show cited sentences only); may be {@code null}
   * @param env the environment, cast to {@code Switchboard}
   * @return the filled properties; only the defaults are set when no url could be resolved or the
   *     document could not be loaded
   */
  public static serverObjects respond(
      @SuppressWarnings("unused") final RequestHeader header,
      final serverObjects post,
      final serverSwitch env) {
    // return variable that accumulates replacements
    final Switchboard sb = (Switchboard) env;
    final serverObjects prop = new serverObjects();
    final Segment segment = sb.index;
    final SolrConnector connector = segment.fulltext().getDefaultConnector();

    // avoid UNRESOLVED PATTERN
    prop.put("url", "");
    prop.put("citations", 0);
    prop.put("sentences", 0);

    DigestURL uri = null;
    String url = "";
    String hash = "";
    int ch = 10;
    boolean filter = false; // show cited sentences only
    if (post != null) {
      if (post.containsKey("url")) {
        url = post.get("url");
        // a url without a known protocol prefix is treated as http
        if (!url.startsWith("http://")
            && !url.startsWith("https://")
            && !url.startsWith("ftp://")
            && !url.startsWith("smb://")
            && !url.startsWith("file://")) {
          url = "http://" + url;
        }
      }
      if (post.containsKey("hash")) {
        hash = post.get("hash");
      }
      if (post.containsKey("ch")) {
        ch = post.getInt("ch", ch);
      }
      filter = post.getBoolean("filter");
    }
    prop.put("filter", filter);

    // a given url takes precedence over a given hash
    if (url.length() > 0) {
      try {
        uri = new DigestURL(url, null);
        hash = ASCII.String(uri.hash());
      } catch (final MalformedURLException ignored) {
        // not a usable url; fall back to the hash lookup below
      }
    }
    if (uri == null && hash.length() > 0) {
      try {
        uri = sb.getURL(ASCII.getBytes(hash));
        if (uri == null) {
          connector.commit(true); // try again, that url can be fresh
          uri = sb.getURL(ASCII.getBytes(hash));
        }
      } catch (IOException e) {
        ConcurrentLog.logException(e);
      }
    }
    if (uri == null) return prop; // no proper url addressed
    url = uri.toNormalform(true);
    prop.put("url", url);

    // get the document from the index
    SolrDocument doc;
    try {
      doc =
          connector.getDocumentById(
              hash,
              CollectionSchema.title.getSolrFieldName(),
              CollectionSchema.text_t.getSolrFieldName());
    } catch (final IOException e1) {
      ConcurrentLog.logException(e1);
      return prop;
    }
    // NOTE(review): assumes the lookup can yield null for an id that is not in the index;
    // without this guard the field access below would fail with a NullPointerException
    if (doc == null) return prop;

    @SuppressWarnings("unchecked")
    ArrayList<String> title =
        (ArrayList<String>) doc.getFieldValue(CollectionSchema.title.getSolrFieldName());
    String text = (String) doc.getFieldValue(CollectionSchema.text_t.getSolrFieldName());

    // collect the non-empty title entries followed by the non-empty sentences of the text
    ArrayList<String> sentences = new ArrayList<String>();
    if (title != null) {
      for (String s : title) {
        if (s != null && s.length() > 0) sentences.add(s);
      }
    }
    if (text != null && !text.isEmpty()) {
      SentenceReader sr = new SentenceReader(text);
      while (sr.hasNext()) {
        StringBuilder line = sr.next();
        if (line.length() > 0) sentences.add(line.toString());
      }
    }

    // for each line make a statistic about the number of occurrences somewhere else
    OrderedScoreMap<String> scores =
        new OrderedScoreMap<String>(null); // accumulates scores for citating urls
    LinkedHashMap<String, Set<DigestURL>> sentenceOcc = new LinkedHashMap<String, Set<DigestURL>>();
    for (String sentence : sentences) {
      if (sentence == null) continue; // nothing to query and nothing to render
      if (sentence.length() < 40) {
        // do not count the very short sentences
        sentenceOcc.put(sentence, null);
        continue;
      }
      try {
        // double quotes would terminate the phrase query, so they are replaced
        final String phrase = sentence.replace('"', '\'');
        SolrDocumentList doclist =
            connector.getDocumentListByQuery(
                "text_t:\"" + phrase + "\"",
                CollectionSchema.url_chars_i.getSolrFieldName() + " asc",
                0,
                100,
                CollectionSchema.sku.getSolrFieldName());
        if (doclist.getNumFound() > 0) {
          Set<DigestURL> list = new TreeSet<DigestURL>();
          for (SolrDocument d : doclist) {
            String u = (String) d.getFieldValue(CollectionSchema.sku.getSolrFieldName());
            if (u == null || u.equals(url)) continue; // skip the requested document itself
            scores.inc(u);
            try {
              list.add(new DigestURL(u, null));
            } catch (final MalformedURLException ignored) {
              // the url is still scored, it just cannot be listed for this sentence
            }
          }
          sentenceOcc.put(phrase, list);
        }
      } catch (final Throwable ee) {
        // best effort: a failing query only drops this sentence from the report,
        // but the failure is made visible in the log
        ConcurrentLog.logException(ee);
      }
    }
    sentences.clear(); // we do not need this again

    // iterate the sentences
    // NOTE(review): sentence text and urls are concatenated into markup without escaping;
    // confirm whether the values need HTML encoding before they reach the template
    int i = 0;
    int sentenceNr = 0;
    for (Map.Entry<String, Set<DigestURL>> se : sentenceOcc.entrySet()) {
      final Set<DigestURL> app = se.getValue();
      final boolean cited = app != null && app.size() > 0;
      // with the filter set only cited sentences are listed, otherwise all of them;
      // sentenceNr keeps counting either way so the numbering stays stable
      if (cited || !filter) {
        StringBuilder dd = new StringBuilder(se.getKey());
        prop.put("sentences_" + i + "_dt", sentenceNr);
        if (cited) appendCitingLinks(dd, app);
        prop.put("sentences_" + i + "_dd", dd.toString());
        i++;
      }
      sentenceNr++;
    }
    prop.put("sentences", i);

    // iterate the citations in order of number of citations
    i = 0;
    for (String u : scores.keyList(false)) {
      try {
        DigestURL uu = new DigestURL(u, null);
        prop.put("citations_" + i + "_dt", "<a href=\"" + u + "\">" + u + "</a>");
        StringBuilder dd = new StringBuilder();
        dd.append("makes ")
            .append(Integer.toString(scores.get(u)))
            .append(" citations: of ")
            .append(url);
        for (Map.Entry<String, Set<DigestURL>> se : sentenceOcc.entrySet()) {
          Set<DigestURL> occurls = se.getValue();
          if (occurls != null && occurls.contains(uu)) {
            dd.append("<br/><a href=\"/solr/select?q=text_t:%22")
                .append(se.getKey().replace('"', '\''))
                .append("%22&rows=100&grep=&wt=grephtml\">")
                .append(se.getKey())
                .append("</a>");
          }
        }
        prop.put("citations_" + i + "_dd", dd.toString());
        i++;
      } catch (final MalformedURLException ignored) {
        // a citing url that cannot be parsed is left out of the list
      }
    }
    prop.put("citations", i);

    // find similar documents from different hosts
    i = 0;
    for (String u : scores.keyList(false)) {
      if (scores.get(u) < ch) continue;
      try {
        DigestURL uu = new DigestURL(u, null);
        if (uu.getOrganization().equals(uri.getOrganization())) continue;
        prop.put("similar_links_" + i + "_url", u);
        i++;
      } catch (final MalformedURLException ignored) {
        // a citing url that cannot be parsed cannot be offered as a similar link
      }
    }
    prop.put("similar_links", i);
    prop.put("similar", i > 0 ? 1 : 0);

    // return rewrite properties
    return prop;
  }

  /** Appends the "appears in" list, one link per citing url, to a sentence description. */
  private static void appendCitingLinks(final StringBuilder dd, final Set<DigestURL> citing) {
    dd.append("<br/>appears in:");
    for (DigestURL u : citing) {
      if (u != null) {
        dd.append(" <a href=\"")
            .append(u.toNormalform(false))
            .append("\">")
            .append(u.getHost())
            .append("</a>");
      }
    }
  }