/** * Recursively extracts all <a> tags in an HTML document and returns them as a list to be added to * the queue to be crawled * * @param url URL of the document, to evaluate relative links * @param parent the parent node of this node * @return All links found in this branch */ public void extract(String url, Node parent) throws MalformedURLException { NodeList nodes = parent.getChildNodes(); // ArrayList<String> links = new ArrayList<String>(); for (int z = 0; z < nodes.getLength(); z++) { Node child = nodes.item(z); // Skip bad tags which we can't recognize! if (child.getNodeName() == null) continue; if (child.getNodeName().equals("a")) { // Links NamedNodeMap attr = child.getAttributes(); if (attr.getNamedItem("href") != null) { // Found a link String link = attr.getNamedItem("href").getNodeValue(); if (!link.startsWith("http")) link = resolve(docId, link); links.add(link); format.push("a"); anchor = true; } } else if (child.getNodeName().equals("b") || child.getNodeName().equals("strong")) { // Bolded format.push("b"); bold = true; } else if (child.getNodeName().equals("i") || child.getNodeName().equals("em")) { // Italicized format.push("i"); ital = true; } else if (child.getNodeName().equals("p")) { // paragraph tag format.push("p"); paragraph = true; } else if (child.getNodeName().equals("title")) { // Title Node format.push("title"); title = true; } else if (child.getNodeType() == Node.TEXT_NODE) { // Text String sentence = child.getNodeValue(); if (title) { docTitle = sentence; } String sentenceStem = removePunctuation(sentence).toLowerCase(); String[] stemlist = stemmer.stemList(sentenceStem.split("\\s+")); boolean[] formatting = {anchor, ital, bold}; for (int i = 0; i < stemlist.length; i++) { // Create Hits // byte[] wordId = lexicon.getWordId(stemlist[i]); // // if (wordId == null) { // lexicon.addNewWord(stemlist[i]); // wordId = lexicon.getWordId(stemlist[i]); // } String word = stemlist[i]; if (Indexer.isInLexicon(word)) { Hit hit = new Hit(docId, word, pos + i); // Check formatting if (ital) hit.setItalicize(true); if (bold) hit.setBold(true); // Make hitlist ArrayList<Hit> list; if (hitList.containsValue(word)) { list = hitList.get(word); } else { list = new ArrayList<Hit>(); } list.add(hit); // If anchor, make an achor hit too if (anchor) { // LINK NEEDS TO BE TESTED! hit = new AnchorHit(docId, word, i, links.get(links.size() - 1)); list.add(hit); anchorList.add(hit); } hitList.put(word, list); if (blurbs.size() < 4 && paragraph) blurbs.add(sentence); } } pos += stemlist.length; text.add(child.getNodeValue()); } else { // Other format.push(child.getNodeName()); } // Recursively find more links if (child.hasChildNodes()) { extract(url, child); } } if (!format.isEmpty()) { String tag = format.pop(); if ("a".equals(tag)) { anchor = false; } else if ("b".equals(tag)) { bold = false; } else if ("i".equals(tag)) { ital = false; } else if ("title".equals(tag)) { title = false; } else if ("p".equals(tag)) { paragraph = false; } } }