Пример #1
0
  /**
   * Computes a smoothed score for the symbol {@code cur} following the symbol {@code prev},
   * using the context record looked up for {@code key}.
   *
   * <p>The score is {@code 0.9 * contextFreq / prevFreq + 0.1 * prevFreq / totalFreq}, where
   * {@code contextFreq} is the context-array cell for (prev, cur) and {@code prevFreq} is the
   * tag frequency of {@code prev}. The 0.9 / 0.1 weights are empirical.
   *
   * @param key key of the context record to consult
   * @param prev symbol of the preceding tag
   * @param cur symbol of the current tag
   * @return the smoothed score, or {@code 0.000001} when the record is missing, either symbol
   *     lookup yields -1, or one of the two frequencies is zero
   */
  public double getPossibility(int key, int prev, int cur) {
    // Small non-zero floor returned instead of 0 to soften data sparsity.
    final double floor = 0.000001;

    int curIndex = Utility.binarySearch(cur, symbolTable);
    int prevIndex = Utility.binarySearch(prev, symbolTable);
    TagContext tc = getItem(key);

    if (tc == null) {
      return floor;
    }
    if (curIndex == -1 || prevIndex == -1) {
      return floor;
    }

    int pairFreq = tc.getContextArray()[prevIndex][curIndex];
    if (pairFreq == 0) {
      return floor;
    }

    int prevFreq = tc.getTagFreq()[prevIndex];
    if (prevFreq == 0) {
      return floor;
    }

    // Weighted blend of the conditional frequency and the relative frequency of prev.
    return 0.9 * pairFreq / prevFreq + 0.1 * prevFreq / tc.getTotalFreq();
  }
Пример #2
0
  /**
   * Looks up the tag frequency recorded for {@code symbol} in the context record of {@code key}.
   *
   * @param key key of the context record to consult
   * @param symbol symbol whose frequency is requested
   * @return the stored frequency, or 0 when the record is missing, the symbol lookup yields -1,
   *     or the record has no tag-frequency array
   */
  public int getFreq(int key, int symbol) {
    TagContext tc = getItem(key);
    if (tc == null) {
      return 0;
    }

    int index = Utility.binarySearch(symbol, symbolTable);
    if (index == -1) {
      // The symbol could not be located in the symbol table.
      return 0;
    }

    return tc.getTagFreq() != null ? tc.getTagFreq()[index] : 0;
  }
Пример #3
0
  /**
   * Loads the symbol table and the context records from a binary file.
   *
   * <p>Layout as consumed by this method, in 4-byte fields: the table length, then that many
   * symbol values, then records until the file length is reached. Each record consists of a key,
   * a total frequency, {@code tableLen} tag frequencies and a {@code tableLen x tableLen} context
   * array. Every parsed record is appended to {@code tcList}; records parsed before a failure
   * remain in the list.
   *
   * @param fileName path of the file to load
   * @param isReset not read anywhere in this method
   * @return {@code true} when the whole file was read; {@code false} when the file is not
   *     readable, declares a negative table length, or an I/O error interrupts the read
   */
  public boolean load(String fileName, boolean isReset) {
    File file = new File(fileName);
    if (!file.canRead()) {
      return false; // fail while opening the file
    }

    DataInputStream in = null;
    try {
      in = new DataInputStream(new FileInputStream(file));
      byte[] b = null;

      // Read the table length.
      tableLen = GFCommon.bytes2int(Utility.readBytes(in, 4), false);
      logger.debug("tableLen:" + tableLen);
      if (tableLen < 0) {
        // A negative length cannot size the arrays below; treat the file as corrupt.
        logger.debug("invalid tableLen:" + tableLen);
        return false;
      }

      // Read the symbol table.
      symbolTable = new int[tableLen];
      for (int i = 0; i < tableLen; i++) {
        b = Utility.readBytes(in, 4);
        symbolTable[i] = GFCommon.bytes2int(b, false);
        logger.debug("symbolTable[" + i + "]:" + symbolTable[i]);
      }

      long fileLen = file.length();
      // Bytes consumed so far; computed in long so a large tableLen cannot overflow int.
      long curLen = 4L + 4L * tableLen;
      while (curLen < fileLen) {
        logger.debug("tagContext:");
        TagContext tc = new TagContext();

        // Read the key.
        // NOTE(review): this is the only field decoded with the one-argument bytes2int
        // overload; confirm that the difference from bytes2int(b, false) is intended.
        b = Utility.readBytes(in, 4);
        int key = GFCommon.bytes2int(b);
        curLen += 4;
        logger.debug("\tkey:" + key);

        // Read the total frequency.
        b = Utility.readBytes(in, 4);
        curLen += 4;
        int totalFreq = GFCommon.bytes2int(b, false);
        logger.debug("\ttotalFreq:" + totalFreq);

        // Read the per-tag frequencies.
        int[] tagFreq = new int[tableLen];
        for (int i = 0; i < tableLen; i++) {
          b = Utility.readBytes(in, 4);
          curLen += 4;
          tagFreq[i] = GFCommon.bytes2int(b, false);
          logger.debug("\ttagFreq[" + i + "]:" + tagFreq[i]);
        }

        // Read the context array.
        int[][] contextArray = new int[tableLen][tableLen];
        for (int i = 0; i < tableLen; i++) {
          StringBuilder pr = new StringBuilder();
          logger.debug("\tcontextArray[" + i + "]");
          for (int j = 0; j < tableLen; j++) {
            b = Utility.readBytes(in, 4);
            curLen += 4;
            contextArray[i][j] = GFCommon.bytes2int(b, false);
            pr.append(' ').append(contextArray[i][j]);
          }
          logger.debug("\t\t" + pr);
        }

        tc.setTotalFreq(totalFreq);
        tc.setKey(key);
        tc.setTagFreq(tagFreq);
        tc.setContextArray(contextArray);
        tcList.add(tc);
      }
      return true;
    } catch (FileNotFoundException e) {
      logger.debug("FileNotFoundException:{}", e);
      return false;
    } catch (IOException e) {
      logger.debug("IOException:{}", e);
      return false;
    } finally {
      // Always release the file handle, including when a read fails part-way through.
      if (in != null) {
        try {
          in.close();
        } catch (IOException e) {
          // A close failure does not change the outcome already decided above.
          logger.debug("IOException:{}", e);
        }
      }
    }
  }