public static void SplitBIG5() { try { ICTCLAS2011 testICTCLAS2011 = new ICTCLAS2011(); String argu = "."; if (ICTCLAS2011.ICTCLAS_Init(argu.getBytes("GB2312"), 2) == false) { // UTF8切分 System.out.println("Init Fail!"); return; } String argu1 = "TestBIG.txt"; String argu2 = "TestBIG_result.txt"; testICTCLAS2011.ICTCLAS_FileProcess(argu1.getBytes("GB2312"), argu2.getBytes("GB2312"), 1); // 释放分词组件资源 ICTCLAS2011.ICTCLAS_Exit(); } catch (Exception ex) { } }
public static void Split(String sInput) { try { ICTCLAS2011 testICTCLAS2011 = new ICTCLAS2011(); String argu = "F:\\workspace\\wordAutoErrorCorrection\\"; System.out.println("ICTCLAS_Init"); if (ICTCLAS2011.ICTCLAS_Init(argu.getBytes("GB2312"), 0) == false) { System.out.println("Init Fail!"); return; } /* * 设置词性标注集 ID 代表词性集 1 计算所一级标注集 0 计算所二级标注集 2 北大二级标注集 3 北大一级标注集 */ testICTCLAS2011.ICTCLAS_SetPOSmap(2); // 导入用户词典前 byte nativeBytes[] = testICTCLAS2011.ICTCLAS_ParagraphProcess(sInput.getBytes("GB2312"), 0); String nativeStr = new String(nativeBytes, 0, nativeBytes.length, "GB2312"); System.out.println("未导入用户词典: " + nativeStr); // 文件分词 String argu1 = "movie.txt"; String argu2 = "TestGBK_result.txt"; testICTCLAS2011.ICTCLAS_FileProcess(argu1.getBytes("GB2312"), argu2.getBytes("GB2312"), 0); /* //导入用户词典 String sUserDict = "userdic.txt"; int nCount = testICTCLAS2011.ICTCLAS_ImportUserDict(sUserDict.getBytes("GB2312")); testICTCLAS2011.ICTCLAS_SaveTheUsrDic();//保存用户词典 System.out.println("导入个用户词: " + nCount); nativeBytes = testICTCLAS2011.ICTCLAS_ParagraphProcess(sInput.getBytes("GB2312"), 1); nativeStr = new String(nativeBytes, 0, nativeBytes.length, "GB2312"); System.out.println("导入用户词典后: " + nativeStr); //动态添加用户词 String sWordUser = "******"; testICTCLAS2011.ICTCLAS_AddUserWord(sWordUser.getBytes("GB2312")); testICTCLAS2011.ICTCLAS_SaveTheUsrDic();//保存用户词典 nativeBytes = testICTCLAS2011.ICTCLAS_ParagraphProcess(sInput.getBytes("GB2312"), 1); nativeStr = new String(nativeBytes, 0, nativeBytes.length, "GB2312"); System.out.println("动态添加用户词后: " + nativeStr); //分词高级接口 nativeBytes = testICTCLAS2011.nativeProcAPara(sInput.getBytes("GB2312")); int nativeElementSize = testICTCLAS2011.ICTCLAS_GetElemLength(0);//size of result_t in native code int nElement = nativeBytes.length / nativeElementSize; byte nativeBytesTmp[] = new byte[nativeBytes.length]; //关键词提取 int nCountKey = testICTCLAS2011.ICTCLAS_KeyWord(nativeBytesTmp, nElement); Result[] resultArr = new Result[nCountKey]; DataInputStream dis = new DataInputStream(new ByteArrayInputStream(nativeBytesTmp)); int iSkipNum; for (int i = 0; i < nCountKey; i++) { resultArr[i] = new Result(); resultArr[i].start = Integer.reverseBytes(dis.readInt()); iSkipNum = testICTCLAS2011.ICTCLAS_GetElemLength(1) - 4; if (iSkipNum > 0) { dis.skipBytes(iSkipNum); } resultArr[i].length = Integer.reverseBytes(dis.readInt()); iSkipNum = testICTCLAS2011.ICTCLAS_GetElemLength(2) - 4; if (iSkipNum > 0) { dis.skipBytes(iSkipNum); } dis.skipBytes(testICTCLAS2011.ICTCLAS_GetElemLength(3)); resultArr[i].posId = Integer.reverseBytes(dis.readInt()); iSkipNum = testICTCLAS2011.ICTCLAS_GetElemLength(4) - 4; if (iSkipNum > 0) { dis.skipBytes(iSkipNum); } resultArr[i].wordId = Integer.reverseBytes(dis.readInt()); iSkipNum = testICTCLAS2011.ICTCLAS_GetElemLength(5) - 4; if (iSkipNum > 0) { dis.skipBytes(iSkipNum); } resultArr[i].word_type = Integer.reverseBytes(dis.readInt()); iSkipNum = testICTCLAS2011.ICTCLAS_GetElemLength(6) - 4; if (iSkipNum > 0) { dis.skipBytes(iSkipNum); } resultArr[i].weight = Integer.reverseBytes(dis.readInt()); iSkipNum = testICTCLAS2011.ICTCLAS_GetElemLength(7) - 4; if (iSkipNum > 0) { dis.skipBytes(iSkipNum); } } dis.close(); for (int i = 0; i < resultArr.length; i++) { System.out.println("start=" + resultArr[i].start + ",length=" + resultArr[i].length + "pos=" + resultArr[i].posId + "word=" + resultArr[i].wordId + " weight=" + resultArr[i].weight); }*/ // 释放分词组件资源 ICTCLAS2011.ICTCLAS_Exit(); } catch (Exception ex) { } }