/**
 * Handles the inner markup of a single wiki link: appends the link's anchor text to {@code
 * context} on a new line and, when the link destination resolves to a known article, records
 * that article's id in {@code bannedTopics}.
 *
 * <p>Markup containing a colon anywhere after its first character is skipped entirely, as it is
 * taken to point outside the main namespace.
 *
 * @param markup the text of the link, either {@code destination} or {@code destination|anchor}.
 * @param context buffer that receives a newline followed by the anchor text.
 * @param bannedTopics set that receives the id of the destination article, if one is found.
 */
private void processLink(String markup, StringBuffer context, HashSet<Integer> bannedTopics) {

    // A colon past index 0 is treated as a namespace prefix; nothing is recorded for such links.
    final boolean outsideMainNamespace = markup.indexOf(':') >= 1;
    if (outsideMainNamespace) {
      return;
    }

    // Split "destination|anchor" on the LAST pipe. With no pipe, or a pipe at index 0, the whole
    // markup serves as both the destination title and the anchor text.
    final int pipeAt = markup.lastIndexOf('|');
    final boolean piped = pipeAt >= 1;
    final String linkText = piped ? markup.substring(pipeAt + 1) : markup;
    final String targetTitle = piped ? markup.substring(0, pipeAt) : markup;

    context.append("\n");
    context.append(linkText);

    final Article target = wikipedia.getArticleByTitle(targetTitle);
    if (target == null) {
      // Destination title does not resolve to an article; nothing to ban.
      return;
    }
    bannedTopics.add(target.getId());
  }
Esempio n. 2
0
  /**
   * Identifies the sentences within this page that contain links to the given target article. The
   * actual text of these sentences can be obtained using {@link Page#getSentenceMarkup(int)}.
   *
   * <p>The link locations stored for {@code art} are retrieved from the environment's
   * page-link-in database and binary-searched for the entry whose link id equals this page's id.
   * The binary search is only correct if that list is sorted in ascending order of link id.
   *
   * @param art the article of interest.
   * @return an array of sentence indexes that contain links to the given article; an empty array
   *     if no link locations are stored for {@code art} or none of them belongs to this page.
   */
  public Integer[] getSentenceIndexesMentioning(Article art) {

    DbLinkLocationList tmpLinks = env.getDbPageLinkIn().retrieve(art.getId());
    if (tmpLinks == null || tmpLinks.getLinkLocations() == null) {
      return new Integer[0];
    }

    // Probe carrying only this page's id; the comparator below never looks at sentence indexes.
    DbLinkLocation key = new DbLinkLocation(id, null);
    int index =
        Collections.binarySearch(
            tmpLinks.getLinkLocations(),
            key,
            new Comparator<DbLinkLocation>() {
              @Override
              public int compare(DbLinkLocation a, DbLinkLocation b) {
                // Integer.compare avoids the deprecated Integer(int) constructor and the
                // per-comparison allocation it caused, while yielding the same ordering.
                return Integer.compare(a.getLinkId(), b.getLinkId());
              }
            });

    // A negative result means this page has no recorded link to the article.
    if (index < 0) {
      return new Integer[0];
    }

    ArrayList<Integer> sentenceIndexes =
        tmpLinks.getLinkLocations().get(index).getSentenceIndexes();

    return sentenceIndexes.toArray(new Integer[sentenceIndexes.size()]);
  }