示例#1
0
  /**
   * Recursively walks the DOM subtree below {@code parent}, collecting links and indexing text.
   *
   * <p>For each child node, in document order:
   *
   * <ul>
   *   <li>{@code <a>} elements with an {@code href} have that link appended to {@code links};
   *       links that do not start with {@code "http"} are first passed through {@code
   *       resolve(docId, link)}.
   *   <li>{@code <a>}, {@code <b>}/{@code <strong>}, {@code <i>}/{@code <em>}, {@code <p>} and
   *       {@code <title>} switch on the matching formatting flag for the duration of their
   *       subtree; every non-text node is also pushed on the {@code format} stack while it is
   *       being processed.
   *   <li>Text nodes are stripped of punctuation, lower-cased, split on whitespace and stemmed.
   *       Each stem accepted by {@code Indexer.isInLexicon} produces a {@code Hit} (plus an
   *       {@code AnchorHit} when inside a link) that is stored in {@code hitList} under the
   *       stem. The raw text is appended to {@code text}, and {@code pos} advances by the number
   *       of stems.
   * </ul>
   *
   * <p>Nothing is returned: all results are accumulated in fields of the enclosing class. The
   * {@code format} stack and the formatting flags are left exactly as they were on entry.
   *
   * @param url URL of the document. NOTE(review): currently only forwarded to the recursive
   *     calls; relative links are resolved against {@code docId} instead - confirm this is
   *     intended.
   * @param parent the node whose children are processed
   * @throws MalformedURLException presumably propagated from {@code resolve} when a relative link
   *     cannot be resolved
   */
  public void extract(String url, Node parent) throws MalformedURLException {

    NodeList nodes = parent.getChildNodes();

    for (int z = 0; z < nodes.getLength(); z++) {
      Node child = nodes.item(z);
      String name = child.getNodeName();

      // Skip bad tags which we can't recognize!
      if (name == null) {
        continue;
      }

      // Snapshot the formatting context so it can be restored once this child's subtree is done.
      // Restoring (rather than blindly clearing) keeps nested tags such as <b><b>..</b>..</b>
      // correct.
      boolean wasAnchor = anchor;
      boolean wasBold = bold;
      boolean wasItal = ital;
      boolean wasParagraph = paragraph;
      boolean wasTitle = title;

      // True when this child pushed an entry on the format stack that must be popped again.
      boolean pushed = false;

      if (name.equals("a")) {
        // Links
        NamedNodeMap attr = child.getAttributes();
        Node href = (attr == null) ? null : attr.getNamedItem("href");

        if (href != null) {
          // Found a link
          String link = href.getNodeValue();

          if (!link.startsWith("http")) {
            link = resolve(docId, link);
          }

          links.add(link);
          format.push("a");
          pushed = true;
          anchor = true;
        }
      } else if (name.equals("b") || name.equals("strong")) {
        // Bolded
        format.push("b");
        pushed = true;
        bold = true;
      } else if (name.equals("i") || name.equals("em")) {
        // Italicized
        format.push("i");
        pushed = true;
        ital = true;
      } else if (name.equals("p")) {
        // Paragraph tag
        format.push("p");
        pushed = true;
        paragraph = true;
      } else if (name.equals("title")) {
        // Title node
        format.push("title");
        pushed = true;
        title = true;
      } else if (child.getNodeType() == Node.TEXT_NODE) {
        // Text
        String sentence = child.getNodeValue();

        if (title) {
          docTitle = sentence;
        }

        String sentenceStem = removePunctuation(sentence).toLowerCase();
        String[] stemlist = stemmer.stemList(sentenceStem.split("\\s+"));

        // Set once any word of this text node makes it into the index.
        boolean indexedAny = false;

        for (int i = 0; i < stemlist.length; i++) {
          String word = stemlist[i];

          if (!Indexer.isInLexicon(word)) {
            continue;
          }
          indexedAny = true;

          // Create the plain hit at the word's absolute position in the document.
          Hit hit = new Hit(docId, word, pos + i);

          // Check formatting
          if (ital) {
            hit.setItalicize(true);
          }
          if (bold) {
            hit.setBold(true);
          }

          // Look the list up by key: the previous containsValue(word) test could never match a
          // list of hits, so earlier hits for the same word were silently discarded.
          ArrayList<Hit> list = hitList.get(word);
          if (list == null) {
            list = new ArrayList<Hit>();
          }
          list.add(hit);

          // If inside a link, make an anchor hit too, pointing at the most recently found link.
          if (anchor) {
            // NOTE(review): the anchor hit uses the index within this text node (i), not the
            // document position (pos + i) used above - confirm which one is intended.
            Hit anchorHit = new AnchorHit(docId, word, i, links.get(links.size() - 1));
            list.add(anchorHit);
            anchorList.add(anchorHit);
          }

          // Store (or re-store) the list under its word.
          hitList.put(word, list);
        }

        // Keep at most four blurbs; add the sentence once, not once per indexed word.
        if (indexedAny && paragraph && blurbs.size() < 4) {
          blurbs.add(sentence);
        }

        pos += stemlist.length;

        text.add(sentence);
      } else {
        // Other
        format.push(name);
        pushed = true;
      }

      // Recursively process this child's own children.
      if (child.hasChildNodes()) {
        extract(url, child);
      }

      // Leaving this child: undo exactly what it pushed, whether or not it had children, so
      // that childless elements (e.g. <br>, comments) cannot leave stale stack entries or flags.
      if (pushed) {
        format.pop();
        anchor = wasAnchor;
        bold = wasBold;
        ital = wasItal;
        paragraph = wasParagraph;
        title = wasTitle;
      }
    }
  }