/**
 * Grabs the content at {@code url} through an {@link HttpDocumentGrabber}, runs it through a
 * {@link DocumentParser} whose enclosing tag is set to {@code "section"}, and writes the parser
 * output to {@code outFile}.
 *
 * @throws BuildException wrapping any exception raised while grabbing, parsing or writing
 */
@Override
public void execute() throws BuildException {
    try {
        log("extracting " + this.url + " into " + this.outFile);

        // Fetch first, so the parser is only created once the content is available.
        HttpDocumentGrabber grabber = new HttpDocumentGrabber();
        String rawDocument = grabber.grab(this.url);

        DocumentParser documentParser = new DocumentParser();
        documentParser.setEnclosingTag("section");

        // The title handed to the parser is derived from the URL itself.
        String parsedOutput =
                documentParser.parse(this.url, UrlUtils.formatTitle(this.url), rawDocument);

        FileUtils.writeStringToFile(this.outFile, parsedOutput);
    } catch (Exception failure) {
        // Task boundary: surface every failure to the build as a BuildException, cause preserved.
        throw new BuildException(failure);
    }
}
/** * Searches for a searchKey in the tree. Also retrieves most relevant snippet. * * @param searchKey, user typed search string. * @return String, the most relevant snippet. */ public String search(String searchKey) { searchKey = searchKey.toLowerCase(); searchKey = searchKey.trim(); searchTerms = searchKey.split("\\W"); // Split around all non-word characters. ArrayList<Integer> termIndex = new ArrayList<Integer>(); ArrayList<Integer> temp; Snippet mostRelevantSnippet = null; if (searchTerms.length != 0) { // Retrieve all indexes of search terms from trieTree for (int i = 0; i < searchTerms.length; i++) { temp = trieTree.getWordIndexes(searchTerms[i]); if (temp != null) termIndex.addAll(temp); } ArrayList<Snippet> allSnippets = new ArrayList<Snippet>(); // Now extract snippets around each term search result. for (int i = 0; i < termIndex.size(); i++) { allSnippets.add(docParser.getSnippet(termIndex.get(i))); } for (int i = 0; i < allSnippets.size(); i++) System.out.println("Snippet " + i + " : " + allSnippets.get(i)); // Score each snippet and extract most relevant one. mostRelevantSnippet = relevanceEngine.getMostRelevant(allSnippets, searchTerms); } if (mostRelevantSnippet == null) return null; return mostRelevantSnippet.toString(); }
/**
 * Populates the trie from the parsed document: every non-null {@code Word} supplied by the
 * document parser is stored under its text together with its start index, in a single pass over
 * the parser's word iterator.
 */
private void initializeTree() {
    for (Iterator<Word> words = docParser.getAllWords(); words.hasNext();) {
        Word word = words.next();
        if (word == null) {
            // The iterator may hand back null entries; there is nothing to index for those.
            continue;
        }
        trieTree.putWord(word.getWord(), word.getStartIndex());
    }
}