public void setGroundMention(ArrayList<XMLTagInfo> groundtruth) throws Exception {
    groundMention.clear();
    keywords.clear();
    for (int i = 0; i < groundtruth.size(); i++) {
      int off = groundtruth.get(i).offset;
      int len = groundtruth.get(i).length;
      groundMention.add(document.substring(off, off + len));

      Mention m = new Mention();
      // Set name/offset/length before they are used below.
      m.name = document.substring(off, off + len);
      m.offset = off;
      m.length = len;

      int context_lo = Math.max(0, off - contextSize);
      int context_hi = Math.min(document.length(), off + contextSize);
      m.context =
          document.substring(context_lo, context_hi)
              .replaceAll("[^0-9a-zA-Z\\s/\\-]", "")
              .toLowerCase();

      int con_lo = Math.max(0, off - 10);
      int con_hi = Math.min(document.length(), off + 10);
      m.contextAroundMention =
          document.substring(con_lo, con_hi).replaceAll("[^0-9a-zA-Z\\s]", " ").toLowerCase();
      m.contextAroundMention += " " + m.name.replaceAll("[^0-9a-zA-Z\\s]", " ").toLowerCase();

      // replaceAll returns a new string, so assign the results back.
      m.context = m.context.replaceAll("\\sand", "").replaceAll("\\snot", "");
      m.contextAroundMention =
          m.contextAroundMention.replaceAll("\\sand", "").replaceAll("\\snot", "");

      keywords.add(m);
    }
  }
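  // Note: the clamped-window logic above mirrors the getContext(offset, length,
  // size) helper used by setKeywordsWikiMiner and setKeywords below. Assuming
  // getContext clamps and strips the same way, these bodies could likely
  // delegate to it, e.g.:
  //
  //   m.context = getContext(off, len, contextSize).toLowerCase();
  //   m.contextAroundMention = getContext(off, len, 10).toLowerCase();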
  // For collective training, where ground-truth mentions are already available.
  public void setKeywordsTraining(
      HashMap<String, ArrayList<XMLTagInfo>> groundMapWiki,
      HashMap<String, ArrayList<XMLTagInfo>> groundMapManual,
      String file) {
    ArrayList<XMLTagInfo> wikiTags = groundMapWiki.get(file);
    if (wikiTags == null) return; // no ground truth recorded for this file
    for (int i = 0; i < wikiTags.size(); i++) {
      XMLTagInfo tag = wikiTags.get(i);
      Mention mention = new Mention();
      mention.key = tag.mention;
      mention.name = tag.mention;
      mention.length = tag.length;
      mention.offset = tag.offset;
      if (mention.name == null)
        mention.name = document.substring(mention.offset, mention.offset + mention.length);
      if (mention.offset < document.length() - 1) {
        int context_lo = Math.max(0, mention.offset - contextSize);
        int context_hi = Math.min(document.length(), mention.offset + contextSize);
        mention.context =
            document.substring(context_lo, context_hi).replaceAll("[^0-9a-zA-Z\\s/\\-]", "");
        int con_lo = Math.max(0, mention.offset - 10);
        int con_hi = Math.min(document.length(), mention.offset + 10);
        mention.contextAroundMention =
            document.substring(con_lo, con_hi).replaceAll("[^0-9a-zA-Z\\s]", " ");
        mention.contextAroundMention += " " + mention.name.replaceAll("[^0-9a-zA-Z\\s]", " ");
        keywords.add(mention);
      }
    }
    if (groundMapManual != null && groundMapManual.get(file) != null) {
      ArrayList<XMLTagInfo> manualTags = groundMapManual.get(file);
      for (int i = 0; i < manualTags.size(); i++) {
        XMLTagInfo tag = manualTags.get(i);
        Mention mention = new Mention();
        mention.key = tag.mention;
        mention.name = tag.mention;
        // The manual tags carry the mention text, so use its length directly.
        mention.length = tag.mention.length();
        mention.offset = tag.offset;
        int context_lo = Math.max(0, mention.offset - contextSize);
        int context_hi = Math.min(document.length(), mention.offset + contextSize);
        mention.context =
            document.substring(context_lo, context_hi).replaceAll("[^0-9a-zA-Z\\s/\\-]", "");
        int con_lo = Math.max(0, mention.offset - 10);
        int con_hi = Math.min(document.length(), mention.offset + 10);
        mention.contextAroundMention =
            document.substring(con_lo, con_hi).replaceAll("[^0-9a-zA-Z\\s]", " ");
        mention.contextAroundMention += " " + mention.name.replaceAll("[^0-9a-zA-Z\\s]", " ");
        keywords.add(mention);
      }
    }
  }
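  // Usage sketch (illustrative; the loader names and the file key are
  // hypothetical, not taken from this codebase):
  //
  //   HashMap<String, ArrayList<XMLTagInfo>> wiki = loadWikiGroundTruth();
  //   HashMap<String, ArrayList<XMLTagInfo>> manual = loadManualGroundTruth();
  //   setKeywordsTraining(wiki, manual, "train_doc_01.xml");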
  public void setKeywordsWikiMiner() {
    try {
      WikipediaAnnotator annotator = new WikipediaAnnotator();
      long annStartTime = System.currentTimeMillis();
      HashMap<String, Label.Sense[]> ment2ent = annotator.annotate(document);
      long annEndTime = System.currentTimeMillis();
      System.out.println(
          "Time taken by annotator: " + (annEndTime - annStartTime) + " milliseconds");

      for (String key : ment2ent.keySet()) {
        // An earlier variant filtered each mention's senses down to those with
        // a Freebase id (via WikiToFreebaseIDMap) before wrapping them; that
        // filter is disabled here and all senses are kept.
        LabelSense senses = new LabelSense(ment2ent.get(key));

        // Keys are assumed to be of the form "<mention>_<offset>"; split at
        // the last underscore so mentions containing underscores still parse.
        int sep = key.lastIndexOf('_');
        String ment = key.substring(0, sep);
        int off = Integer.parseInt(key.substring(sep + 1));

        Mention mention = new Mention();
        mention.key = ment;
        mention.name = ment;
        mention.length = ment.length();
        mention.offset = off;
        mention.context = getContext(off, mention.length, contextSize);
        mention.contextAroundMention = getContext(off, mention.length, 10);
        mention.senses = senses;
        keywords.add(mention);
      }
    } catch (Exception e) {
      e.printStackTrace();
    }
  }
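  // Key-format example (illustrative): an annotator key "New York_128" parses
  // to ment = "New York" and off = 128; splitting at the last underscore keeps
  // any underscores inside the mention text intact.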
  public void consolidateMentions(int maxLength) {

    if (!Config.Server && thesaurus == null) {
      thesaurus = new Wikisaurus();
    }

    ClientWikisauras obj = new ClientWikisauras();

    // (An earlier variant also built a LuceneIndexWrapper over the complete,
    // redirect, inlink, disambiguation, and anchor indexes; disabled here.)

    ArrayList<Mention> mentions = new ArrayList<Mention>();
    mentions.addAll(keywords);
    keywords = new ArrayList<Mention>();

    // token_type: 0 = not yet consumed, 1 = covered by a consolidated mention,
    // 2 = article token (excluded from the output and never extended).
    int[] token_type = new int[mentions.size()];

    int curr_offset = 0;
    String curr_mention = "";
    for (int i = 0; i < mentions.size(); i++) {
      if (token_type[i] != 0) continue;

      curr_offset = mentions.get(i).offset;
      curr_mention = mentions.get(i).name;

      // Build candidate spans of 1..maxLength space-delimited words starting
      // at the current mention; longer spans sit at higher indices.
      String[] allWords = new String[maxLength];
      allWords[0] = curr_mention;

      int currWordEnd = curr_offset + curr_mention.length() + 1;
      int k = 1;
      for (; k < maxLength; k++) {
        currWordEnd = document.indexOf(" ", currWordEnd + 1);
        if (currWordEnd == -1) currWordEnd = document.length();
        if (curr_offset < 0 || curr_offset >= document.length()) {
          k--;
          break;
        }
        allWords[k] = document.substring(curr_offset, currWordEnd);
        if (currWordEnd >= document.length()) break;
      }
      if (k == maxLength) k--;

      // Try the longest candidate span first, falling back to shorter spans
      // until one yields at least one Wikipedia sense.
      for (; k >= 0; k--) {
        LabelSense senses = null;
        try {
          if (Config.Server) {
            senses = obj.getSenses(allWords[k]);
          } else {
            Label.Sense[] temp = thesaurus.getSenses(allWords[k]);
            if (temp != null) senses = new LabelSense(temp);
            // Earlier variants also consulted the Freebase title map
            // (WikiToFreebaseIDMap) and a Lucene phrase search over the
            // page_title / title_disamb fields as fallback candidate sources;
            // both are disabled here, the hard-coded Freebase lookup being too
            // slow for long text.
          }
        } catch (Exception e) {
          e.printStackTrace();
          System.exit(1);
        }
        if (senses != null) {
          // (An earlier variant re-filtered these senses against Freebase ids
          // before use; that filter is disabled here.)
          Mention new_mention = new Mention();
          new_mention.name = allWords[k];
          new_mention.length = new_mention.name.length();
          new_mention.offset = curr_offset;
          new_mention.context = getContext(curr_offset, new_mention.length, contextSize);
          new_mention.contextAroundMention = getContext(curr_offset, new_mention.length, 10);
          if (k == 0) new_mention.key = mentions.get(i).key;
          new_mention.senses = senses;

          System.out.println("wikiminer candidate for : " + new_mention.name);
          for (int ic = 0; ic < senses.wikiMinerCandidate.length; ++ic) {
            System.out.println(
                "\t" + senses.wikiMinerCandidate[ic] + "  " + senses.wikiMinerProbability[ic]);
          }

          keywords.add(new_mention);

          // Mark every original mention covered by the consolidated span as
          // consumed; article tokens are flagged separately and never extended.
          if (!isArticleToken(curr_mention)) {
            for (int j = i;
                j < mentions.size()
                    && mentions.get(j).offset < (new_mention.offset + new_mention.length);
                j++) token_type[j] = 1;
          } else {
            token_type[i] = 2;
          }
          break;
        }
      }

      // No candidate span produced a sense: keep the original single token if
      // it is a valid, non-article token.
      if (token_type[i] == 0 && !isArticleToken(curr_mention) && isValidToken(curr_mention)) {
        keywords.add(mentions.get(i));
      }
    }
  }
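  // Worked example (illustrative): with maxLength = 3 and document text
  // "... New York City mayor ...", the mention starting at "New" yields the
  // candidate spans "New", "New York", and "New York City". The descending
  // loop asks for senses of "New York City" first; if the thesaurus returns
  // candidates, the merged span is emitted as a single mention and the three
  // covered tokens are marked token_type = 1.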
  public void setKeywords(boolean stem) throws Exception {
    tagged_document = tagger.tagString(document);
    ArrayList<String> tokens = new ArrayList<String>();

    StringTokenizer str = new StringTokenizer(tagged_document);
    while (str.hasMoreTokens()) {
      String token = str.nextToken();
      if (token == null || "".equals(token) || " ".equals(token)) continue;
      // Keep non-stopwords and any token tagged as a noun, adjective, or one
      // of the extra tags; adding each matching token exactly once.
      if (!Stopwords.isStopword(token.split("_")[0])
          || noun_tags.contains(token.split("_")[1])
          || adj_tags.contains(token.split("_")[1])
          || extra_tags.contains(token.split("_")[1])) tokens.add(token);
    }
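    // Tagged-token format (assumed from the word_TAG splits above): the POS
    // tagger emits tokens such as "quick_JJ" or "fox_NN", e.g.
    //   "The_DT quick_JJ brown_JJ fox_NN jumps_VBZ"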

    int curr_offset = 0;
    for (int i = 0; i < tokens.size(); i++) {
      if (tokens.get(i) == null) continue;
      Matcher matcher = pattern.matcher(tokens.get(i));
      if (!matcher.find()) continue; // token does not match the word_TAG pattern
      String word = matcher.group(1);
      String tag = matcher.group(2);

      if (word == null || "".equals(word)) continue;
      String token = word.replaceAll("[^0-9a-zA-Z\\s/\\-]", "");
      if ("".equals(token) || "/".equals(token)) continue;
      if (!(noun_tags.contains(tag) || adj_tags.contains(tag) || extra_tags.contains(tag)))
        continue;

      Mention mention = new Mention();

      if (tag.equals("JJ")) {
        String temp = TestJAWS.getNounForm(token);
        if (temp != null && !"".equals(temp)) {
          mention.key = temp;
          prev_tag = null;
        } else {
          mention.key = token;
          prev_tag = null;
        }
      } else {
        mention.key = token;
      }

      mention.name = word;
      mention.length = word.length();
      int idx = document.indexOf(word, curr_offset);
      if (idx < 0) continue; // tagger token not found verbatim in the document
      curr_offset = idx;

      mention.offset = curr_offset;
      mention.context = getContext(curr_offset, mention.length, contextSize);
      mention.contextAroundMention = getContext(curr_offset, mention.length, 10);
      keywords.add(mention);
    }
    consolidateMentions(6);
  }
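  // End-to-end sketch (hypothetical driver; the enclosing class name is an
  // assumption, the method calls are from this file):
  //
  //   KeywordExtractor ke = new KeywordExtractor(rawText);
  //   ke.setKeywords(false);        // POS-based spotting + consolidateMentions(6)
  //   // or, alternatively:
  //   ke.setKeywordsWikiMiner();    // Wikipedia-Miner-based spotting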