예제 #1
0
/**
 * In-memory inverted index mapping a lower-cased word to the files, and the
 * token positions inside each file, where that word occurs.
 *
 * <p>The index is populated by {@link #initFromText(String)} (or by calling
 * {@link #addWord(String, long, long)} directly). Words are filtered through
 * the shared {@code Dictionary} frequency table before being stored.
 */
public class WordIndex {
  private static final Logger log = Logger.getLogger(WordIndex.class.getName());

  /** word -> per-file position lists, in the order files were added. */
  private final HashMap<String, ArrayList<FilePositions>> idx =
      new HashMap<String, ArrayList<FilePositions>>();
  private final Dictionary dict = Dictionary.getInstance();

  /** All recorded positions of one word inside one file. */
  class FilePositions {
    long fileid;
    ArrayList<Long> poslist;
  }

  /**
   * Records one occurrence of {@code str} at position {@code pos} of file
   * {@code fileid}, unless the word is filtered out.
   *
   * <p>Consecutive calls for the same word and the same {@code fileid} append
   * to the same position list; a call with a different {@code fileid} than the
   * word's most recent entry starts a new entry.
   *
   * @param str the (already lower-cased) word
   * @param fileid identifier of the file the word was found in
   * @param pos position of the word within that file
   * @return {@code true} if the occurrence was stored, {@code false} if the
   *     word was filtered out
   */
  public boolean addWord(String str, long fileid, long pos) {
    /*
     * Filter out certain strings. e.g. words that are either very rare (e.g.
     * spelling mistakes) or extremely common "stop words".
     * http://en.wikipedia.org/wiki/Stop_words
     *
     * Our definition of stop words is completely arbitrary. It has more to do
     * with what kind of computation we can do on a single appengine node.
     *
     * Ideally we would not ignore any words, but that will require sharding the
     * word index across a number of serving machines.
     */
    if (str.length() < 3) {
      return false;
    }
    int freq = dict.getFrequency(str);
    if (freq == 0 || freq > 90000) {
      return false;
    }

    ArrayList<FilePositions> fplist = idx.get(str);
    if (fplist == null) {
      fplist = new ArrayList<FilePositions>();
      idx.put(str, fplist);
    }

    // Only the most recent entry is checked, so occurrences of one file are
    // grouped together only when they are added contiguously.
    if (fplist.isEmpty() || fplist.get(fplist.size() - 1).fileid != fileid) {
      FilePositions fp = new FilePositions();
      fp.fileid = fileid;
      fp.poslist = new ArrayList<Long>();
      fplist.add(fp);
    }
    fplist.get(fplist.size() - 1).poslist.add(pos);
    return true;
  }

  /**
   * Dumps the index to standard output, one word per line, in the form
   * {@code word fileid pos pos ... . fileid pos ... . }.
   */
  private void writeMap() {
    for (Map.Entry<String, ArrayList<FilePositions>> e : idx.entrySet()) {
      System.out.print(e.getKey());
      for (FilePositions fp : e.getValue()) {
        System.out.print(" " + fp.fileid);
        for (Long pos : fp.poslist) {
          System.out.print(" " + pos);
        }
        System.out.print(" . ");
      }
      System.out.println();
    }
  }

  /**
   * Builds the index from a binary file of records. Each record is read as a
   * {@code long} file id, a {@code long} byte length, and then that many bytes
   * of text. Reading stops cleanly when the end of file is hit where a new
   * record's file id would start.
   *
   * <p>Any other I/O problem (including a truncated or corrupt record) is
   * logged as a warning and ends the load; whatever was indexed up to that
   * point is kept. Progress is reported on standard error roughly every
   * 100 MB of text processed.
   *
   * @param fname path of the file to read
   */
  public void initFromText(String fname) {
    long count = 0;
    long last = 0;
    long values = 0;

    DataInputStream is = null;
    try {
      is = new DataInputStream(new FileInputStream(fname));
      while (true) {
        long fileid;
        try {
          fileid = is.readLong();
        } catch (EOFException e) {
          // Normal termination: no further record.
          break;
        }
        long len = is.readLong();
        // A negative or > 2 GB length cannot be a valid array size; treat it
        // as a corrupt file instead of truncating the value with a cast.
        if (len < 0 || len > Integer.MAX_VALUE) {
          throw new IOException(
              "Invalid record length " + len + " for fileid " + fileid);
        }
        byte[] buf = new byte[(int) len];
        // read() may return fewer bytes than requested; readFully() either
        // fills the buffer or throws EOFException on a truncated record.
        is.readFully(buf);

        // Only ASCII letters survive tokenization, so pin the charset rather
        // than depend on the platform default.
        values += indexText(fileid, new String(buf, "UTF-8"));

        count += len;
        if (count > last + 100 * 1024 * 1024) {
          System.err.printf(
              "processed = %d Mb, # words = %d, # positions = %d, lastfileid = %d\n",
              count / 1024 / 1024, idx.size(), values, fileid);
          last = count;
        }
      }
    } catch (IOException e) {
      log.log(Level.WARNING, "Error: " + fname, e);
    } finally {
      if (is != null) {
        try {
          is.close();
        } catch (IOException e) {
          log.log(Level.WARNING, "Error closing: " + fname, e);
        }
      }
    }
  }

  /**
   * Splits {@code text} on runs of non-letters and feeds every token to
   * {@link #addWord(String, long, long)}, numbering tokens from 0. Tokens that
   * are filtered out still consume a position number.
   *
   * @return the number of tokens that were actually stored
   */
  private long indexText(long fileid, String text) {
    long added = 0;
    int pos = 0;
    for (String t : text.split("[^a-zA-Z]+")) {
      // Fixed locale: under e.g. a Turkish default locale "I" would lower-case
      // to a dotless i and never match the dictionary.
      // NOTE(review): the explicit copy presumably keeps tokens from pinning
      // the whole record's backing array on older JVMs - confirm before
      // removing it.
      t = new String(t.toLowerCase(java.util.Locale.ENGLISH));
      if (addWord(t, fileid, pos++)) {
        added++;
      }
    }
    return added;
  }

  /**
   * Loads the record file named by {@code args[0]} and prints the resulting
   * index to standard output.
   */
  public static void main(String args[]) {
    WordIndex wi = new WordIndex();
    wi.initFromText(args[0]);
    wi.writeMap();
  }
}
 /**
  * Test fixture: loads the dictionary from {@code words.txt} and assigns
  * {@code game} a new {@code PlayingBoard} built on a
  * {@code WordsWithFriendsBoard} description.
  *
  * @throws FileNotFoundException presumably when {@code words.txt} cannot be
  *     opened - confirm against {@code Dictionary.getInstance(File)}
  */
 @Before
 public void setup() throws FileNotFoundException {
   final Dictionary wordList = Dictionary.getInstance(new File("words.txt"));
   game = new PlayingBoard(new WordsWithFriendsBoard(), wordList);
 }