예제 #1
0
 /** Segments {@code line} with {@code ToAnalysis} and prints every term name on its own line. */
 public void test() {
   for (Term token : ToAnalysis.parse(line)) {
     System.out.println(token.getName());
   }
 }
예제 #2
0
  /**
   * Corrects the nature (part of speech) of terms using user-defined dictionaries.
   *
   * <p>Terms whose name fully matches {@code stopwordPattern} (when one is set) are dropped. For
   * every remaining term each forest is consulted in order; whenever a forest returns parameters
   * for the term name, the term's nature is replaced by the first parameter, so the last matching
   * forest wins.
   *
   * @param all terms to process
   * @param forests dictionaries to consult; when {@code null}, {@code UserDefineLibrary.FOREST} is
   *     used, and if that is {@code null} too the input list is returned untouched
   * @return a new list with the surviving terms, or {@code all} itself when no dictionary exists
   */
  public static List<Term> updateNature(List<Term> all, Forest... forests) {
    Forest[] dictionaries = forests;
    if (dictionaries == null) {
      if (UserDefineLibrary.FOREST == null) {
        return all;
      }
      dictionaries = new Forest[] {UserDefineLibrary.FOREST};
    }

    List<Term> kept = new ArrayList<Term>();
    for (Term term : all) {
      // Regex stop-word support: skip terms whose whole name matches the pattern.
      boolean isStopword =
          stopwordPattern != null && stopwordPattern.matcher(term.getName()).matches();
      if (!isStopword) {
        for (Forest dictionary : dictionaries) {
          String[] params = UserDefineLibrary.getParams(dictionary, term.getName());
          if (params != null) {
            term.setNature(new Nature(params[0]));
          }
        }
        kept.add(term);
      }
    }
    return kept;
  }
  /**
   * Classifies the sentiment of the request text.
   *
   * <p>The text is segmented with {@code ToAnalysis}, the term names are scored by {@code fbsc},
   * and the score is mapped to an emotion code: {@code 1} when the score is above {@code 0.5},
   * {@code 2} when it is below {@code -0.5}, and {@code 3} otherwise.
   *
   * @param req request carrying the text to classify
   * @return a response holding the original request, the raw score and the emotion code
   * @throws TException declared by the overridden method; nothing in this body throws it directly
   */
  @Override
  public EmotionResponse emotionJudge(EmotionRequest req) throws TException {
    long startMillis = System.currentTimeMillis();

    List<String> words = new ArrayList<>();
    for (Term term : ToAnalysis.parse(req.getText())) {
      words.add(term.getName());
    }

    double score = fbsc.classify(words);
    int emotion = score > 0.5 ? 1 : (score < -0.5 ? 2 : 3);

    EmotionResponse response = new EmotionResponse();
    response.setReq(req);
    response.setScore(score);
    response.setEmotion(emotion);

    long elapsedMillis = System.currentTimeMillis() - startMillis;
    logger.info("emotion: {}, used:{}ms", emotion, elapsedMillis);
    return response;
  }
예제 #4
0
  /**
   * Segments {@code text} and returns the trimmed term names that pass the configured filters.
   *
   * <p>A word is dropped when it is empty after trimming, when {@code filterSingleWord} is on and
   * it is shorter than two characters, when {@code filterStopWords} is on and it is contained in
   * {@code stopWordsSet}, or when {@code pureNum} finds a match in it (per the original comment:
   * words containing digits are not wanted). With {@code debug} on, the accepted words are logged.
   *
   * @param text text to segment
   * @return accepted words in segmentation order
   */
  public List<String> cut(String text) {
    List<String> accepted = new ArrayList<>();
    StringBuilder debugTrace = new StringBuilder();

    for (Term term : ToAnalysis.parse(text)) {
      String word = term.getName().trim();
      if (word.isEmpty()) {
        continue;
      }
      // Single-character filter.
      if (filterSingleWord && word.length() < 2) {
        continue;
      }
      if (filterStopWords && stopWordsSet.contains(word)) {
        continue;
      }
      // Words in which the pureNum pattern matches anywhere are discarded.
      if (pureNum.matcher(word).find()) {
        continue;
      }

      accepted.add(word);
      if (debug) {
        debugTrace.append(word).append(",");
      }
    }

    if (debug) {
      logger.info("cut {} into {}", text, debugTrace.toString());
    }
    return accepted;
  }
예제 #5
0
  /**
   * Verifies that a word inserted into the user dictionary is segmented as a single term carrying
   * the given nature, and that it is no longer produced once it has been removed.
   */
  @Test
  public void test() {
    String newWord = "爸爸去哪儿";
    String nature = "aaaaa";
    String str = "上海电力2012年财务报表如下怎爸爸去哪儿么办";

    // Add the new word.
    UserDefineLibrary.insertWord(newWord, nature, 1000);
    try {
      HashMap<String, Term> hs = new HashMap<String, Term>();
      for (Term term : ToAnalysis.parse(str)) {
        hs.put(term.getName(), term);
      }

      Assert.assertTrue(hs.containsKey(newWord));
      // Expected value goes first so a failure message reports expected/actual correctly.
      Assert.assertEquals(nature, hs.get(newWord).natrue().natureStr);
    } finally {
      // Remove the word even when an assertion above fails, so the statically accessed
      // dictionary is not left modified for whatever runs next.
      UserDefineLibrary.removeWord(newWord);
    }

    HashMap<String, Term> afterRemoval = new HashMap<String, Term>();
    for (Term term : ToAnalysis.parse(str)) {
      afterRemoval.put(term.getName(), term);
    }

    Assert.assertFalse(afterRemoval.containsKey(newWord));
  }
 /**
  * Runs the recognition pass and wraps every recognized term as a {@code NewWord} with nature
  * {@code TermNatures.NR}, the term's {@code selfScore}, and a final constructor argument of 1.
  *
  * @return one {@code NewWord} per term returned by {@code recogntion_()}
  */
 public List<NewWord> getNewWords() {
   List<Term> recognized = recogntion_();
   List<NewWord> words = new ArrayList<NewWord>();
   for (Term candidate : recognized) {
     words.add(new NewWord(candidate.getName(), TermNatures.NR, candidate.selfScore, 1));
   }
   return words;
 }
예제 #7
0
 /**
  * Segments a sentence and converts each term into a {@code ParseTerm} built from the term's
  * real name and nature string.
  *
  * @param sentence text to segment
  * @return converted terms in segmentation order
  */
 @Override
 public List<ParseTerm> parse(String sentence) {
   List<ParseTerm> converted = new ArrayList<ParseTerm>();
   for (Term term : ToAnalysis.parse(sentence)) {
     converted.add(new ParseTerm(term.getRealName(), term.getNatureStr()));
   }
   return converted;
 }
 /**
  * Creates a term from the text accumulated in {@code sb}, links it between {@code from} and
  * {@code to}, and inserts it into {@code terms}.
  *
  * <p>The term starts at {@code offe}, receives {@code score} as its self score and
  * {@code tempNature} as its nature.
  */
 private void makeNewTerm() {
   String word = sb.toString();
   Term merged = new Term(word, offe, tempNature.natureStr, 1);
   merged.selfScore = score;
   merged.setNature(tempNature);

   // Only words longer than three characters get the sub-terms found between from and to.
   if (sb.length() > 3) {
     merged.setSubTerm(TermUtil.getSubTerm(from, to));
   }

   TermUtil.termLink(from, merged);
   TermUtil.termLink(merged, to);
   TermUtil.insertTerm(terms, merged);
   TermUtil.parseNature(merged);
 }
예제 #9
0
 /**
  * Segments {@code text} and returns the term names separated by single spaces.
  *
  * @param text text to segment
  * @return the space-joined term names, trimmed of leading and trailing whitespace
  */
 public String segAnsi(String text) {
   StringBuilder joined = new StringBuilder();
   for (Term term : ToAnalysis.parse(text)) {
     joined.append(term.getName()).append(" ");
   }
   return joined.toString().trim();
 }
예제 #10
0
  /**
   * Looks up the frequency recorded for the word pair {@code from} → {@code to}.
   *
   * @param from the preceding term; its {@code getTermNatures().id} selects the bigram table row
   * @param to the following term; its {@code getTermNatures().id} is searched for within that row
   * @return the stored frequency, or {@code 0} when {@code from}'s id is negative or the pair is
   *     not found
   */
  public static int getTwoWordFreq(Term from, Term to) {
    int fromId = from.getTermNatures().id;
    if (fromId < 0) {
      return 0;
    }

    BigramEntry[] row = bigramTables[fromId];
    int position = binarySearch(row, to.getTermNatures().id);
    return position < 0 ? 0 : row[position].freq;
  }
예제 #11
0
  /**
   * Processes one tab-separated record held in {@code line}.
   *
   * <p>Only records with exactly eight fields whose fourth field parses to an integer greater
   * than 1 are handled: their eighth field is segmented and each term name is printed.
   */
  @Override
  public void run() {
    String[] fields = line.split("\t");
    if (fields.length != 8) {
      return;
    }
    if (Integer.parseInt(fields[3]) <= 1) {
      return;
    }

    for (Term term : ToAnalysis.parse(fields[7])) {
      System.out.println(term.getName());
    }
  }
예제 #12
0
  /**
   * Merges runs of terms delimited by a start/end pair from {@code ruleMap} into a single term.
   *
   * <p>When a term's name is a key of {@code ruleMap}, it opens a run whose closing name is the
   * mapped value. Every following term is collected until one with the closing name is seen; the
   * collected terms are then merged into the opening term, which receives {@code nature} and is
   * emitted in their place. Terms outside any run are emitted unchanged.
   *
   * <p>If the input ends while a run is still open, the collected terms are emitted unmerged.
   * (The previous code iterated over the output list while appending to that same list, which
   * threw {@code ConcurrentModificationException} for a non-empty output and silently lost the
   * collected terms for an empty one.)
   *
   * @param result segmentation result whose term list is replaced by the merged list
   */
  public void recognition(Result result) {
    List<Term> terms = result.getTerms();
    String end = null;
    LinkedList<Term> mergeList = null;
    List<Term> list = new LinkedList<Term>();

    for (Term term : terms) {
      String name = term.getName();
      if (end == null) {
        end = ruleMap.get(name);
        if (end != null) {
          // This term opens a run; start collecting.
          mergeList = new LinkedList<Term>();
          mergeList.add(term);
        } else {
          list.add(term);
        }
      } else {
        mergeList.add(term);
        if (end.equals(name)) {
          // Closing term reached: fold everything collected into the opening term.
          Term ft = mergeList.pollFirst();
          for (Term sub : mergeList) {
            ft.merage(sub);
          }
          ft.setNature(nature);
          list.add(ft);
          mergeList = null;
          end = null;
        }
      }
    }

    // Unterminated run: keep its terms as they are instead of dropping them.
    if (mergeList != null) {
      list.addAll(mergeList);
    }

    result.setTerms(list);
  }
예제 #13
0
  /**
   * Computes the path score for moving from one term to the next.
   *
   * <p>The result is {@code from.getScore()} plus the negative log of a smoothed combination of
   * {@code from}'s own frequency and the bigram frequency of the pair.
   *
   * @param from the preceding term
   * @param to the following term
   * @return the accumulated score; when {@code from}'s frequency plus one is negative the result
   *     is {@code from.getScore() + MAX_FREQUENCE}
   */
  public static double compuScore(Term from, Term to) {
    double frequency = from.getTermNatures().allFreq + 1;
    if (frequency < 0) {
      return from.getScore() + MAX_FREQUENCE;
    }

    int pairFreq = TwoWordLibrary.getTwoWordFreq(from, to);
    double unigramPart = dSmoothingPara * frequency / (MAX_FREQUENCE + 80000);
    double bigramPart = (1 - dSmoothingPara) * ((1 - dTemp) * pairFreq / frequency + dTemp);
    double value = -Math.log(unigramPart + bigramPart);

    // A negative value is lifted by the frequency, at most twice (the original code performs
    // this check-and-add two times in a row).
    for (int round = 0; round < 2 && value < 0; round++) {
      value += frequency;
    }
    return from.getScore() + value;
  }
예제 #14
0
 /**
  * Filters stop words and corrects the nature of the remaining terms.
  *
  * <p>A term is dropped when {@code FILTER} is non-empty and contains either the term name or,
  * if {@code isTag} is set, {@code TAG} followed by the term's nature string; it is also dropped
  * when {@code stopwordPattern} is set and fully matches the name. For surviving terms, if the
  * user dictionary returns parameters for the name, the nature is replaced by the first one.
  *
  * <p>Any exception aborts the loop: a warning is logged and the terms accepted so far are
  * returned.
  *
  * @param all terms to process
  * @return a new list with the surviving terms
  */
 public static List<Term> modifResult(List<Term> all) {
   List<Term> kept = new ArrayList<Term>();
   try {
     for (Term term : all) {
       String name = term.getName();

       boolean stopped =
           FILTER.size() > 0
               && (FILTER.contains(name)
                   || (isTag && FILTER.contains(TAG + term.natrue().natureStr)));
       // Regex stop-word support.
       if (!stopped && stopwordPattern != null) {
         stopped = stopwordPattern.matcher(name).matches();
       }
       if (stopped) {
         continue;
       }

       String[] params = UserDefineLibrary.getParams(name);
       if (params != null) {
         term.setNature(new Nature(params[0]));
       }
       kept.add(term);
     }
   } catch (Exception e) {
     MyStaticValue.LIBRARYLOG.warn(
         "FilterStopWord.updateDic can not be null , "
             + "you must use set FilterStopWord.setUpdateDic(map) or use method set map");
   }
   return kept;
 }
예제 #15
0
 /**
  * Ad-hoc check: builds two terms, assigns each the word id looked up in {@code InitDictionary},
  * and prints the bigram frequency of the pair.
  */
 public static void main(String[] args) {
   Term first = new Term("阿", 0, new TermNatures(TermNature.NULL));
   first.getTermNatures().id = InitDictionary.getWordId(first.getName());

   Term second = new Term("全国", 0, new TermNatures(TermNature.NULL));
   second.getTermNatures().id = InitDictionary.getWordId(second.getName());

   System.out.println(getTwoWordFreq(first, second));
 }
 /**
  * Filters stop words and corrects the nature of the remaining terms.
  *
  * <p>A term is dropped when {@code FILTER} is non-empty and contains either the term name or
  * {@code TAG} followed by the term's nature string. For surviving terms, if the user dictionary
  * returns parameters for the name, the nature is replaced by the first one.
  *
  * <p>Any exception aborts the loop: a message is printed to standard error and the terms
  * accepted so far are returned.
  *
  * @param all terms to process
  * @return a new list with the surviving terms
  */
 public static List<Term> modifResult(List<Term> all) {
   List<Term> kept = new ArrayList<Term>();
   try {
     for (Term term : all) {
       String name = term.getName();
       boolean stopped =
           FILTER.size() > 0
               && (FILTER.contains(name) || FILTER.contains(TAG + term.getNatrue().natureStr));
       if (!stopped) {
         String[] params = UserDefineLibrary.getParams(name);
         if (params != null) {
           term.setNature(new Nature(params[0]));
         }
         kept.add(term);
       }
     }
   } catch (Exception e) {
     System.err.println(
         "FilterStopWord.updateDic can not be null , "
             + "you must use set FilterStopWord.setUpdateDic(map) or use method set map");
   }
   return kept;
 }
 /**
  * Scans the {@code terms} array for person-name candidates.
  *
  * <p>For every non-null term whose {@code personAttr.flag} is set, the term's scores are reset
  * and {@code nameFind} is tried for {@code j} = 2, 1, 0 whenever
  * {@code personAttr.getFreq(j, 0)} exceeds 10. Each non-null result is collected. When
  * {@code nameFind} leaves the {@code skip} field set, the scores of the terms covered by the
  * candidate are zeroed and scanning resumes after it.
  *
  * @return the recognized candidate terms, in discovery order
  */
 private List<Term> recogntion_() {
   Term term = null;
   Term tempTerm = null;
   List<Term> termList = new ArrayList<Term>();
   // Start value handed to nameFind for the first candidate; afterwards it is derived from
   // the last processed term (see the end of the loop body).
   int beginFreq = 10;
   for (int i = 0; i < terms.length; i++) {
     term = terms[i];
     if (term == null || !term.getTermNatures().personAttr.flag) {
       continue;
     }
     term.score = 0;
     term.selfScore = 0;
     int freq = 0;
     for (int j = 2; j > -1; j--) {
       freq = term.getTermNatures().personAttr.getFreq(j, 0);
       // NOTE(review): the second operand also requires freq > 10, so the name-length test
       // cannot change the outcome; possibly a different threshold was intended - confirm.
       if ((freq > 10) || (term.getName().length() == 2 && freq > 10)) {
         tempTerm = nameFind(i, beginFreq, j);
         if (tempTerm != null) {
           termList.add(tempTerm);
           // If this is an undisputed recognition (skip is assigned inside nameFind).
           if (skip) {
             for (int j2 = i; j2 < tempTerm.getToValue(); j2++) {
               if (terms[j2] != null) {
                 terms[j2].score = 0;
                 terms[j2].selfScore = 0;
               }
             }
             // Continue the outer scan right after the recognized candidate.
             i = tempTerm.getToValue() - 1;
             break;
           }
         }
       }
     }
     // Uses the term read at the top of this iteration, even if i was moved above.
     beginFreq = term.getTermNatures().personAttr.begin + 1;
   }
   return termList;
 }
예제 #18
0
  /**
   * Scores a candidate new word made of consecutive terms.
   *
   * <p>The score starts as the negated sum of the bigram frequencies of the left link (predecessor
   * of the first term → first term), the right link (last term → its successor) and every
   * internal link. If that sum is below {@code -3} the candidate is rejected with {@code 0}.
   * Otherwise the term scores of the first term ({@code getB()}), the last term ({@code getE()})
   * and the averaged middle terms ({@code getM()}) are added.
   *
   * <p>The middle contribution is divided by {@code all.size() - 1}. For a single-term list that
   * divisor is zero; the division is skipped in that case, because {@code 0.0 / 0} would turn the
   * whole score into {@code NaN}.
   *
   * @param all the consecutive terms forming the candidate; must contain at least one element
   * @return {@code 0} when the link frequencies reject the candidate, otherwise its score
   */
  public static double leftRightEntropy(List<Term> all) {
    double score = 0;
    NewWordNatureAttr newWordAttr = null;
    Term first = all.get(0);

    // Left link.
    int twoWordFreq = TwoWordLibrary.getTwoWordFreq(first.getFrom(), first);
    score -= twoWordFreq;

    // Right link.
    int length = all.size() - 1;
    Term end = all.get(all.size() - 1);
    twoWordFreq = TwoWordLibrary.getTwoWordFreq(end, end.getTo());
    score -= twoWordFreq;

    // Internal links.
    for (int i = 0; i < length; i++) {
      score -= TwoWordLibrary.getTwoWordFreq(all.get(i), all.get(i + 1));
    }
    if (score < -3) {
      return 0;
    }

    // Score of the first term.
    newWordAttr = first.getTermNatures().newWordAttr;
    score += getTermScore(newWordAttr, newWordAttr.getB());
    // Score of the last term.
    newWordAttr = end.getTermNatures().newWordAttr;
    score += getTermScore(newWordAttr, newWordAttr.getE());
    // Score of the middle terms.
    double midelScore = 0;
    for (int i = 1; i < length; i++) {
      newWordAttr = all.get(i).getTermNatures().newWordAttr;
      midelScore += getTermScore(newWordAttr, newWordAttr.getM());
    }
    // Guard the single-term case: dividing by zero would make the score NaN.
    if (length > 0) {
      score += midelScore / length;
    }
    return score;
  }
  /**
   * Segments the input and tallies term names.
   *
   * <p>The first occurrence of a name is stored with score {@code 0}; each further occurrence
   * increments the stored score by one. The {@code tokens} and {@code token_iterator} fields are
   * overwritten as a side effect.
   *
   * @param input_str text to segment
   * @return map from term name to its {@code TermScore}
   */
  @Override
  public HashMap<String, TermScore> tokenizeTerm(String input_str) {
    tokens = ToAnalysis.parse(input_str);
    token_iterator = tokens.listIterator();

    HashMap<String, TermScore> scores = new HashMap<String, TermScore>();
    while (token_iterator.hasNext()) {
      String name = token_iterator.next().getName();
      TermScore seen = scores.get(name);
      if (seen == null) {
        scores.put(name, new TermScore(name, 0));
      } else {
        // The entry is already in the map, so mutating it in place is enough.
        seen.setScore(seen.getScore() + 1);
      }
    }

    return scores;
  }
예제 #20
0
  /**
   * Person-name disambiguation, operating in place on the term array.
   *
   * <p>Example from the original comment: "邓颖超生前" segmented as "邓颖 / 超生 / 前" should become
   * "邓颖超 / 生 / 前". A second, rule-based pass joins two person names that are connected by a
   * separator character ('-', '·', '•' according to the original comment; the code tests for
   * '·' after {@code WordAlert.CharCover}).
   *
   * @param terms term array indexed by offset; entries may be {@code null} and are modified
   */
  public static void nameAmbiguity(Term[] terms) {
    Term from = null;
    Term term = null;
    Term next = null;
    for (int i = 0; i < terms.length - 1; i++) {
      term = terms[i];
      // Two-character terms recognized as a person name (NR).
      if (term != null && term.termNatures() == TermNatures.NR && term.getName().length() == 2) {
        // NOTE(review): terms[i + 2] / terms[i + 3] are accessed although the loop only
        // guarantees i < terms.length - 1, and next is dereferenced without a null check;
        // presumably the array layout guarantees these slots - confirm.
        next = terms[i + 2];
        if (next.termNatures().personAttr.split > 0) {
          // Move the first character of the following term onto the name ...
          term.setName(term.getName() + next.getName().charAt(0));
          terms[i + 2] = null;
          // ... and re-create the remainder as a new term one slot further, relinking both.
          terms[i + 3] = new Term(next.getName().substring(1), next.getOffe(), TermNatures.NW);
          TermUtil.termLink(term, terms[i + 3]);
          TermUtil.termLink(terms[i + 3], next.to());
        }
      }
    }

    // Foreign person-name correction.
    for (int i = 0; i < terms.length; i++) {
      term = terms[i];
      if (term != null
          && term.getName().length() == 1
          && i > 0
          && WordAlert.CharCover(term.getName().charAt(0)) == '·') {
        from = term.from();
        next = term.to();

        // Both neighbours carry a nature string starting with "nr": join
        // "<from><separator><next>" into from and drop the two absorbed slots.
        // NOTE(review): from/next are not null-checked and terms[i + 1] is written without a
        // bounds check - confirm the caller's invariants.
        if (from.natrue().natureStr.startsWith("nr") && next.natrue().natureStr.startsWith("nr")) {
          from.setName(from.getName() + term.getName() + next.getName());
          TermUtil.termLink(from, next.to());
          terms[i] = null;
          terms[i + 1] = null;
        }
      }
    }
  }
예제 #21
0
 /**
  * Computes a score for a pair of terms as the sum of their overall frequencies.
  *
  * @param from the preceding term
  * @param term the following term
  * @return {@code from}'s {@code allFreq} plus {@code term}'s {@code allFreq}
  */
 public static double compuScoreFreq(Term from, Term term) {
   double combined = from.getTermNatures().allFreq + term.getTermNatures().allFreq;
   return combined;
 }
  /**
   * Person-name recognition: tries to build a name candidate starting at a given index.
   *
   * <p>Non-null terms from {@code offe} onwards are concatenated (at most {@code size + 2} of
   * them) while their per-position person frequency is non-zero. The candidate score is
   * {@code -log(FACTORY[size])} plus the accumulated log-frequency terms, minus the logs of the
   * end frequency of the following term and of {@code beginFreq}. The {@code skip} field is reset
   * to {@code false} on entry and, on success, set to whether no term with a positive
   * {@code personAttr.allFreq} was part of the candidate.
   *
   * @param offe index in {@code terms} at which the candidate starts; also the new term's offset
   * @param beginFreq begin frequency whose logarithm is subtracted from the score
   * @param size name-shape selector passed to {@code getFreq} and used to index {@code FACTORY}
   * @return the candidate term with nature {@code TermNatures.NR} and its score in
   *     {@code selfScore}, or {@code null} when the candidate is rejected
   */
  private Term nameFind(int offe, int beginFreq, int size) {
    StringBuilder sb = new StringBuilder();
    // Counts candidate terms whose personAttr.allFreq is positive.
    int undefinite = 0;
    skip = false;
    PersonNatureAttr pna = null;
    int index = 0;
    int freq = 0;
    double allFreq = 0;
    Term term = null;
    int i = offe;
    for (; i < terms.length; i++) {
      // Walk forward until a complete name has been collected; empty slots are skipped.
      if (terms[i] == null) {
        continue;
      }
      term = terms[i];
      pna = term.getTermNatures().personAttr;
      // Frequency of this word at this position for this name shape; zero means the candidate
      // is impossible, so give up immediately.
      if ((freq = pna.getFreq(size, index)) == 0) {
        return null;
      }

      if (pna.allFreq > 0) {
        undefinite++;
      }
      sb.append(term.getName());
      allFreq += Math.log(term.getTermNatures().allFreq + 1);
      allFreq += -Math.log((freq));
      index++;

      if (index == size + 2) {
        break;
      }
    }

    double score = -Math.log(FACTORY[size]);
    score += allFreq;
    double endFreq = 0;
    // Look for the first non-null term after the candidate.
    boolean flag = true;
    while (flag) {
      i++;
      if (i >= terms.length) {
        // Ran off the end of the array: use a fixed end frequency.
        endFreq = 10;
        flag = false;
      } else if (terms[i] != null) {
        // A bigram frequency above 3 between the last name term and the follower rejects the
        // candidate.
        int twoWordFreq = NgramLibrary.getTwoWordFreq(term, terms[i]);
        if (twoWordFreq > 3) {
          return null;
        }
        endFreq = terms[i].getTermNatures().personAttr.end + 1;
        flag = false;
      }
    }

    score -= Math.log(endFreq);
    score -= Math.log(beginFreq);

    // Only sufficiently negative scores are accepted.
    if (score > -3) {
      return null;
    }

    if (allFreq > 0 && undefinite > 0) {
      return null;
    }

    skip = undefinite == 0;

    term = new Term(sb.toString(), offe, TermNatures.NR);
    term.selfScore = score;

    return term;
  }
  /**
   * Walks the {@code terms} array and follows {@code branch} children by term name to build
   * merged terms.
   *
   * <p>For each non-null term (the last array slot is not visited) the term's scores are reset
   * and the matching child branch is looked up. If one exists and its status is not 3, the
   * following terms are consumed through {@code getTo()} while child branches keep matching.
   * Status 1 only appends the name; status 2 appends, creates a new term via
   * {@code makeNewTerm()} and keeps going; status 3 appends, creates a new term and stops.
   * {@code reset()} is called after every attempt.
   *
   * <p>NOTE(review): the status codes presumably mean 1 = prefix only, 2 = word that can be
   * extended, 3 = terminal word - confirm against the branch implementation.
   */
  public void recognition() {
    if (branch == null) {
      return;
    }
    int length = terms.length - 1;

    Term term = null;
    for (int i = 0; i < length; i++) {
      if (terms[i] == null) {
        continue;
      } else {
        from = terms[i].getFrom();
        terms[i].score = 0;
        terms[i].selfScore = 0;
      }

      branch = branch.getBranch(terms[i].getName());

      if (branch == null || branch.getStatus() == 3) {
        reset();
        continue;
      }

      offe = i;

      // Keep looking ahead and appending.
      term = terms[i];
      sb.append(term.getName());
      if (branch.getStatus() == 2) {
        term.selfScore = branch.getParam().getScore();
      }
      boolean flag = true;
      while (flag) {
        term = term.getTo();
        branch = branch.getBranch(term.getName());
        // No matching child branch: stop extending.
        if (branch == null) {
          break;
        }

        // Note: "continue" below continues the enclosing while loop; "break" only leaves the
        // switch.
        switch (branch.getStatus()) {
          case 1:
            sb.append(term.getName());
            continue;
          case 2:
            sb.append(term.getName());
            score = branch.getParam().getScore();
            tempNature = branch.getParam().getNature();
            to = term.getTo();
            makeNewTerm();
            continue;
          case 3:
            sb.append(term.getName());
            score = branch.getParam().getScore();
            tempNature = branch.getParam().getNature();
            to = term.getTo();
            makeNewTerm();
            flag = false;
            break;
          default:
            // Unexpected status: report it; the while loop then moves on to the next term.
            System.out.println("怎么能出现0呢?");
            break;
        }
      }
      reset();
    }
  }
  /**
   * Merges adjacent numeric terms (number + number) in place.
   *
   * <p>A "." whose neighbours are both numeric is folded, together with those neighbours, into
   * the preceding term. A run of consecutive numeric terms is concatenated into the first one,
   * and a directly following term with a positive {@code numEndFreq} is appended as well.
   * Absorbed array slots are set to {@code null} and the term links are updated. The last array
   * slot is not visited.
   *
   * @param terms term array indexed by offset; entries may be {@code null} and are modified
   */
  public static void recognition(Term[] terms) {
    int length = terms.length - 1;
    Term from = null;
    Term to = null;
    Term temp = null;
    for (int i = 0; i < length; i++) {
      if (terms[i] == null) {
        continue;
      } else if (".".equals(terms[i].getName())) {
        // A "." with a number on both sides gets special treatment.
        to = terms[i].getTo();
        from = terms[i].getFrom();
        if (from.getTermNatures().numAttr.flag && to.getTermNatures().numAttr.flag) {
          from.setName(from.getName() + "." + to.getName());
          TermUtil.termLink(from, to.getTo());
          terms[to.getOffe()] = null;
          terms[i] = null;
          // Rewind so the next iteration re-examines the merged term at from's offset.
          i = from.getOffe() - 1;
        }
        continue;
      } else if (!terms[i].getTermNatures().numAttr.flag) {
        continue;
      }

      temp = terms[i];
      // Concatenate every directly following numeric term.
      while ((temp = temp.getTo()).getTermNatures().numAttr.flag) {
        terms[i].setName(terms[i].getName() + temp.getName());
      }
      // If the next term can end a number (numEndFreq > 0), append it too.
      if (temp.getTermNatures().numAttr.numEndFreq > 0) {
        terms[i].setName(terms[i].getName() + temp.getName());
        temp = temp.getTo();
      }

      // If they differ, terms[i] has absorbed at least one following term.
      if (terms[i].getTo() != temp) {
        TermUtil.termLink(terms[i], temp);
        // Clear the absorbed slots in between.
        for (int j = i + 1; j < temp.getOffe(); j++) {
          terms[j] = null;
        }
        // Resume at temp on the next iteration.
        i = temp.getOffe() - 1;
      }
    }
  }