예제 #1
0
  /**
   * Computes the score of the transition from one term to the term that follows it.
   *
   * <p>The frequency of {@code from} is taken as {@code from.termNatures().allFreq + 1}. When that
   * value is negative, {@code MAX_FREQUENCE} is added to the current score of {@code from}, the sum
   * is written back onto {@code from} via {@code from.score(double)}, and the same sum is returned.
   *
   * <p>Otherwise the cost is the negative logarithm of a blend, weighted by {@code dSmoothingPara},
   * of a term built from the frequency of {@code from} alone and a term built from the pair
   * frequency reported by {@code NgramLibrary.getTwoWordFreq(from, to)}, with {@code dTemp} added
   * to the latter. A negative cost has the frequency of {@code from} added to it. The result is
   * that cost plus the current score of {@code from}; {@code from} is not modified on this path.
   *
   * @param from the preceding term
   * @param to the following term
   * @return the score described above
   */
  public static double compuScore(Term from, Term to) {
    final double fromFreq = from.termNatures().allFreq + 1;

    // Negative frequency: penalise the term itself and store the penalised score on it.
    if (fromFreq < 0) {
      final double penalized = from.score() + MAX_FREQUENCE;
      from.score(penalized);
      return penalized;
    }

    final int pairFreq = NgramLibrary.getTwoWordFreq(from, to);

    // Component that depends only on the frequency of the preceding term.
    final double singlePart = dSmoothingPara * fromFreq / (MAX_FREQUENCE + 80000);
    // Component that depends on how often the two terms are seen together.
    final double pairPart = (1 - dTemp) * pairFreq / fromFreq + dTemp;

    final double cost = -Math.log(singlePart + (1 - dSmoothingPara) * pairPart);

    // A negative cost is shifted by the frequency of the preceding term before being added.
    return from.score() + (cost < 0 ? cost + fromFreq : cost);
  }
  /**
   * Person-name recognition: tries to assemble a name from the terms starting at {@code offe}.
   *
   * <p>Walks {@code terms} from {@code offe}, ignoring {@code null} slots, and concatenates the
   * names of at most {@code size + 2} terms. For each collected term the per-position frequency is
   * looked up with {@code personAttr.getFreq(size, position)}; a frequency of zero rejects the
   * candidate immediately. The first non-null term after the collected ones is then inspected: a
   * two-word frequency above 3 between the last collected term and that term also rejects the
   * candidate. When no such term exists a fixed end frequency of 10 is used.
   *
   * <p>Side effect: the {@code skip} field is reset to {@code false} on entry and, only when a term
   * is returned, set to {@code true} if none of the collected terms had a positive
   * {@code personAttr.allFreq}.
   *
   * @param offe index in {@code terms} where scanning starts; also passed as the second
   *     constructor argument of the returned term
   * @param beginFreq value whose natural logarithm is subtracted from the score
   * @param size selector passed to {@code getFreq} and used as the index into {@code FACTORY};
   *     collection stops once {@code size + 2} terms have been taken
   * @return a new term with nature {@code TermNatures.NR} whose {@code selfScore} is the computed
   *     score, or {@code null} when the candidate is rejected
   */
  private Term nameFind(int offe, int beginFreq, int size) {
    skip = false;

    StringBuilder name = new StringBuilder();
    int ambiguous = 0; // collected terms whose personAttr.allFreq is positive
    int position = 0; // number of terms collected so far
    double logSum = 0;
    Term last = null; // most recently collected term
    int cursor = offe;

    // Collect the terms that make up the candidate name.
    while (cursor < terms.length) {
      Term candidate = terms[cursor];
      if (candidate != null) {
        last = candidate;
        PersonNatureAttr attr = candidate.getTermNatures().personAttr;

        // No frequency recorded for this position at this size: give up on the candidate.
        int positionFreq = attr.getFreq(size, position);
        if (positionFreq == 0) {
          return null;
        }

        if (attr.allFreq > 0) {
          ambiguous++;
        }

        name.append(candidate.getName());
        logSum = logSum + Math.log(candidate.getTermNatures().allFreq + 1) - Math.log(positionFreq);
        position++;

        // Leave the cursor on the last collected term once enough terms have been taken.
        if (position == size + 2) {
          break;
        }
      }
      cursor++;
    }

    double score = -Math.log(FACTORY[size]) + logSum;

    // Look for the first non-null term after the cursor; 10 applies when there is none.
    double endFreq = 10;
    for (int next = cursor + 1; next < terms.length; next++) {
      Term follower = terms[next];
      if (follower == null) {
        continue;
      }
      if (NgramLibrary.getTwoWordFreq(last, follower) > 3) {
        return null;
      }
      endFreq = follower.getTermNatures().personAttr.end + 1;
      break;
    }

    score = score - Math.log(endFreq) - Math.log(beginFreq);

    // Reject when the score is above -3, or when the log sum is positive and at least one
    // collected term had a positive personAttr.allFreq.
    if (score > -3 || (logSum > 0 && ambiguous > 0)) {
      return null;
    }

    skip = (ambiguous == 0);

    Term result = new Term(name.toString(), offe, TermNatures.NR);
    result.selfScore = score;
    return result;
  }