private void processLink(String markup, StringBuffer context, HashSet<Integer> bannedTopics) { // ignore everything that is not in main namespace if (markup.indexOf(":") > 0) return; String anchor = markup; String dest = markup; int pos = markup.lastIndexOf("|"); if (pos > 0) { anchor = markup.substring(pos + 1); dest = markup.substring(0, pos); } context.append("\n").append(anchor); Article art = wikipedia.getArticleByTitle(dest); if (art != null) { bannedTopics.add(art.getId()); } }
/**
 * Identifies the sentences within this article that contain links to the given target article.
 * The actual text of these sentences can be obtained using {@link Page#getSentenceMarkup(int)}.
 *
 * <p>The link locations stored for the target's id are retrieved and binary-searched for the entry
 * whose link id equals this page's id. NOTE(review): the binary search is only correct if that
 * list is sorted in ascending link-id order; confirm against the code that writes it.
 *
 * @param art the article of interest; must not be {@code null}.
 * @return an array of sentence indexes that contain links to the given article; an empty array if
 *     no link locations are stored for the target, if none of them matches this page, or if the
 *     matching entry carries no sentence indexes.
 * @throws NullPointerException if {@code art} is {@code null}.
 */
public Integer[] getSentenceIndexesMentioning(Article art) {
  DbLinkLocationList tmpLinks = env.getDbPageLinkIn().retrieve(art.getId());
  if (tmpLinks == null || tmpLinks.getLinkLocations() == null) {
    return new Integer[0];
  }

  // Probe object carrying only this page's id; the comparator below looks at link ids only.
  DbLinkLocation key = new DbLinkLocation(id, null);
  int index =
      Collections.binarySearch(
          tmpLinks.getLinkLocations(),
          key,
          new Comparator<DbLinkLocation>() {
            @Override
            public int compare(DbLinkLocation a, DbLinkLocation b) {
              // Integer.compare avoids the deprecated Integer constructor and needless boxing.
              return Integer.compare(a.getLinkId(), b.getLinkId());
            }
          });
  if (index < 0) {
    return new Integer[0];
  }

  ArrayList<Integer> sentenceIndexes =
      tmpLinks.getLinkLocations().get(index).getSentenceIndexes();
  if (sentenceIndexes == null) {
    // A matched entry without a sentence list yields an empty result rather than an NPE.
    return new Integer[0];
  }
  return sentenceIndexes.toArray(new Integer[sentenceIndexes.size()]);
}