/**
 * Updates the local jieba user dictionary with security short names and theme names that are
 * not in it yet. Should be called before the getThemeSecPair methods.
 *
 * <p>The dictionary is read from {@code JiebaUserDictPath} through
 * {@code FileIOUtil.readJiebaDict}, every missing word is added with the tag {@code "n"}
 * (presumably the jieba part-of-speech tag for nouns; confirm against the dictionary format),
 * and the result is written back to the same path through {@code FileIOUtil.writeJiebaDict}.
 *
 * <p>This method never throws: any failure is logged together with its stack trace. When the
 * dictionary cannot be loaded (the reader returns {@code null}) nothing is written, so an
 * existing dictionary file is not overwritten with a partial one.
 *
 * @author dengxiang.liu
 */
public void updateLocalJiebaDictWithSecAndTheme() {
  String jiebaUserDictPath = JiebaUserDictPath;
  logger.info("jieba user dictionary file path: {}", jiebaUserDictPath);
  try {
    List<DatayesdbpSecurity> securityList = datayesdbpMapper.getSecurityList();
    Map<String, String> tokenMap = FileIOUtil.readJiebaDict(jiebaUserDictPath);
    logger.info("original jieba dictionary size: {}", tokenMap == null ? 0 : tokenMap.size());
    if (tokenMap == null) {
      // Fail explicitly instead of running into a NullPointerException further down.
      logger.error(
          "jieba dictionary could not be loaded from {}, skip updating", jiebaUserDictPath);
      return;
    }

    // Counts every word added below, securities and themes alike.
    int addedNewsWordsCount = 0;
    if (securityList != null) {
      for (DatayesdbpSecurity datayesdbpSecurity : securityList) {
        if (datayesdbpSecurity != null
            && addJiebaWordIfAbsent(tokenMap, datayesdbpSecurity.getSecShortName())) {
          addedNewsWordsCount++;
        }
      }
    }

    List<BigdataTheme> bigdataThemeList = bigdataMapper.getThemeList();
    if (bigdataThemeList != null) {
      for (BigdataTheme bigdataTheme : bigdataThemeList) {
        if (bigdataTheme != null
            && addJiebaWordIfAbsent(tokenMap, bigdataTheme.getThemeName())) {
          addedNewsWordsCount++;
        }
      }
    }

    FileIOUtil.writeJiebaDict(jiebaUserDictPath, tokenMap);
    logger.info("Add {} new words into jieba dictionary.", addedNewsWordsCount);
  } catch (Exception e) {
    // Pass the exception itself so the stack trace is kept in the log.
    logger.error("error occurs when update jieba dictionary", e);
  }
}

/**
 * Adds {@code word} to {@code tokenMap} with the tag {@code "n"} unless it is already a key.
 *
 * @param tokenMap dictionary to update; must not be {@code null}
 * @param word candidate word; {@code null} and empty strings are ignored
 * @return {@code true} if the word was added, {@code false} otherwise
 */
private boolean addJiebaWordIfAbsent(Map<String, String> tokenMap, String word) {
  if (word == null || word.isEmpty() || tokenMap.containsKey(word)) {
    return false;
  }
  tokenMap.put(word, "n");
  return true;
}
/** * @Author: Dengxiang.Liu @Return: Map<Long, List<ThemeSecurityPair>>; a map from themeID to it's * ThemeSecurityPair List; @Desc: the result will be QA before insert into Database; */ public Map<String, List<ThemeSecurityObjPair>> getThemeSecPairToBeQA() { logger.info("start getThemeSecPairToBeQA();"); Map<String, Set<String>> curThemeSecPairMap = getCurrentThemeSecurityPair(); List<DatayesdbpSecurity> securityList = datayesdbpMapper.getSecurityList(); /* handle new themes */ List<BigdataTheme> bigdataThemeList = bigdataMapper.getThemeList(); // all themes list; logger.info( "all themes list size: {};", bigdataThemeList == null ? 0 : bigdataThemeList.size()); List<BigdataTheme> unhandledThemeList = getUnhandledThemeList(bigdataThemeList); logger.info( "unhandled themes list size: {};", unhandledThemeList == null ? 0 : unhandledThemeList.size()); Set<BigdataTheme> unhandledThemeSet = (unhandledThemeList == null || unhandledThemeList.isEmpty()) ? null : new HashSet<BigdataTheme>(unhandledThemeList); Map<String, List<ThemeSecurityObjPair>> newsThemeSecPairMapUnhandled = textAnalizer.findThemesSecPair(unhandledThemeSet, securityList, false); Map<String, List<ThemeSecurityObjPair>> crawlThemeSecPairMapUnhandled = mongoDataAnalizer.findThemeSecPair(unhandledThemeSet, securityList, false); Map<String, List<ThemeSecurityObjPair>> newsThemeSecPairMapToBeQAUnhandled = getDiffThemeSecPairBetween(curThemeSecPairMap, newsThemeSecPairMapUnhandled); Map<String, List<ThemeSecurityObjPair>> crawlThemeSecPairMapToBeQAUnhandled = getDiffThemeSecPairBetween(curThemeSecPairMap, crawlThemeSecPairMapUnhandled); /* handle schedule themes: schedule-themes = all-themes - unhandled-themes*/ List<BigdataTheme> scheduleThemeList = getSchedualThemeList(bigdataThemeList, unhandledThemeList); Set<BigdataTheme> scheduleThemeSet = new HashSet<BigdataTheme>( scheduleThemeList == null ? 
new ArrayList<BigdataTheme>() : scheduleThemeList); logger.info( "schedule themes Set size: {}", scheduleThemeSet == null ? 0 : scheduleThemeSet.size()); Map<String, List<ThemeSecurityObjPair>> newsThemeSecPairMapSchedule = textAnalizer.findThemesSecPair(scheduleThemeSet, securityList, true); Map<String, List<ThemeSecurityObjPair>> crawlThemeSecPairMapSchedule = mongoDataAnalizer.findThemeSecPair(scheduleThemeSet, securityList, true); Map<String, List<ThemeSecurityObjPair>> newsThemeSecPairMapToBeQASchedule = getDiffThemeSecPairBetween(curThemeSecPairMap, newsThemeSecPairMapSchedule); Map<String, List<ThemeSecurityObjPair>> crawlThemeSecPairMapToBeQASchedule = getDiffThemeSecPairBetween(curThemeSecPairMap, crawlThemeSecPairMapSchedule); /* integrate theme-security pair maps */ Map<String, List<ThemeSecurityObjPair>> globalThemeSecPairMapToBeQA = new HashMap<String, List<ThemeSecurityObjPair>>(); IntegrateMaps(globalThemeSecPairMapToBeQA, newsThemeSecPairMapToBeQAUnhandled); IntegrateMaps(globalThemeSecPairMapToBeQA, newsThemeSecPairMapToBeQASchedule); IntegrateMaps(globalThemeSecPairMapToBeQA, crawlThemeSecPairMapToBeQAUnhandled); IntegrateMaps(globalThemeSecPairMapToBeQA, crawlThemeSecPairMapToBeQASchedule); logger.info("finished getThemeSecPairToBeQA();"); return globalThemeSecPairMapToBeQA; }