/**
 * Adds the short names of all securities and the names of all themes currently returned by the
 * database mappers to the local jieba user dictionary file.
 *
 * <p>Entries already present in the dictionary are left untouched; every missing name is added
 * with the value {@code "n"} (presumably jieba's noun tag - confirm against the dictionary
 * format). According to the original author this should be called before the getThemeSecPair
 * methods.
 *
 * <p>Failures are logged and never propagated to the caller. If the existing dictionary cannot be
 * obtained, the file is not rewritten.
 *
 * @author dengxiang.liu
 */
public void updateLocalJiebaDictWithSecAndTheme() {
  String jiebaUserDictPath = JiebaUserDictPath;
  logger.info("jieba user dictionary file path: {}", jiebaUserDictPath);
  try {
    List<DatayesdbpSecurity> securityList = datayesdbpMapper.getSecurityList();
    Map<String, String> tokenMap = FileIOUtil.readJiebaDict(jiebaUserDictPath);
    logger.info("original jieba dictionary size: {}", tokenMap == null ? 0 : tokenMap.size());
    if (tokenMap == null) {
      // Abort instead of writing: a map built from scratch would replace whatever is on disk.
      logger.error("jieba user dictionary could not be read from {}, skip update", jiebaUserDictPath);
      return;
    }

    // Counts every word added in this run, securities and themes alike.
    int addedNewsWordsCount = 0;

    if (securityList != null) {
      for (DatayesdbpSecurity datayesdbpSecurity : securityList) {
        String secShortName =
            datayesdbpSecurity == null ? null : datayesdbpSecurity.getSecShortName();
        if (secShortName != null && !tokenMap.containsKey(secShortName)) {
          tokenMap.put(secShortName, "n");
          addedNewsWordsCount++;
        }
      }
    }

    List<BigdataTheme> bigdataThemeList = bigdataMapper.getThemeList();
    if (bigdataThemeList != null) {
      for (BigdataTheme bigdataTheme : bigdataThemeList) {
        String themeName = bigdataTheme == null ? null : bigdataTheme.getThemeName();
        if (themeName != null && !tokenMap.containsKey(themeName)) {
          tokenMap.put(themeName, "n");
          addedNewsWordsCount++;
        }
      }
    }

    FileIOUtil.writeJiebaDict(jiebaUserDictPath, tokenMap);
    logger.info("Add {} new words into jieba dictionary.", addedNewsWordsCount);
  } catch (Exception e) {
    // Trailing throwable argument makes the logger print the stack trace as well.
    logger.error("error occurs when update jieba dictionary, {}", e.toString(), e);
  }
}
/**
 * Returns the themes from {@code bigdataThemeList} whose theme ID is not a key of the
 * handled-themes map read from {@code HANDLED_THEMES_FILE_PATH}.
 *
 * <p>Related (per the original author): updateHandledThemeFile().
 *
 * @author dengxiang.liu
 * @param bigdataThemeList themes to check (the original author describes them as the themes
 *     list from the database); may be {@code null}
 * @return the themes not found in the handled-themes map, in input order; never {@code null}.
 *     Empty when the input is {@code null} or when the handled-themes map is unavailable.
 */
public List<BigdataTheme> getUnhandledThemeList(List<BigdataTheme> bigdataThemeList) {
  /** themes to be handled */
  List<BigdataTheme> unHandledThemeList = new ArrayList<BigdataTheme>();
  // Check the input first so the handled-themes file is not read for nothing.
  if (null == bigdataThemeList) {
    return unHandledThemeList;
  }

  /** themes that have already been handled, keyed by theme ID, loaded from the local file */
  Map<Long, BigdataTheme> handledThemeMap = FileIOUtil.readThemeMap(HANDLED_THEMES_FILE_PATH);
  if (null == handledThemeMap) {
    // Without the map nothing can be classified; report it once and return no themes.
    logger.error("handled theme map could not be read from {}", HANDLED_THEMES_FILE_PATH);
    logger.info("get unhandled theme list size: {}", unHandledThemeList.size());
    return unHandledThemeList;
  }

  for (BigdataTheme bigdataTheme : bigdataThemeList) {
    if (null == bigdataTheme) {
      logger.error("skip null theme in bigdataThemeList");
      continue;
    }
    try {
      Long themeID = bigdataTheme.getThemeID();
      if (!handledThemeMap.containsKey(themeID)) {
        unHandledThemeList.add(bigdataTheme);
      }
    } catch (Exception e) {
      // One bad entry must not abort the scan; keep the stack trace for diagnosis.
      logger.error(e.toString(), e);
    }
  }
  logger.info("get unhandled theme list size: {}", unHandledThemeList.size());
  return unHandledThemeList;
}
/** * @author dengxiang.liu * @param mongoThemeStockList * @param securityList * @param bigdataThemeSet * @return theme security pairs list @Desc get theme security pairs from mongo data about target * theme; */ public List<ThemeSecurityObjPair> getThemeSecurityPairList( List<MongoThemeStock> mongoThemeStockList, List<DatayesdbpSecurity> securityList, Set<BigdataTheme> bigdataThemeSet) { List<ThemeSecurityObjPair> themeSecurityPairList = new ArrayList<ThemeSecurityObjPair>(); /** used to make sure the uniqueness of theme-security pair: themeID ---> tickerSymbol set */ Map<Long, Set<String>> globalThemeSecPairMap = new HashMap<Long, Set<String>>(); if (mongoThemeStockList == null || securityList == null) return themeSecurityPairList; /** @Map: security ticer symbol ---> DatayesdbpSecurity Object */ Map<String, DatayesdbpSecurity> tickerSymbol2Obj = new HashMap<String, DatayesdbpSecurity>(); for (DatayesdbpSecurity security : securityList) { String tickerSymbol = security.getTickerSymbol(); tickerSymbol2Obj.put(tickerSymbol, security); } /** @Map: theme name ---> BigdataTheme themeID */ Map<String, Long> themeName2ID = new HashMap<String, Long>(); for (BigdataTheme bigdataTheme : bigdataThemeSet) { Long themeID = bigdataTheme.getThemeID(); String themeName = bigdataTheme.getThemeName(); themeName2ID.put(themeName, themeID); } for (MongoThemeStock mongoThemeStock : mongoThemeStockList) { String themeName = mongoThemeStock.getThemeName(); String dateStr = mongoThemeStock.getDate(); Date findTime = null; try { findTime = DateUtil.strToDate(dateStr, DateUtil.DatePattern.day); } catch (ParseException e) { e.printStackTrace(); } /** theme-security pair description */ String webSite = mongoThemeStock.getWebSite(); String pairType = "crawl"; // List<String> tickerSymbolList = new ArrayList<String>(); try { List<String> tickerSymbolList = mongoThemeStock.getRelateStock(); if (tickerSymbolList == null || tickerSymbolList.isEmpty()) continue; for (String tickerSymbol : 
tickerSymbolList) { if (!tickerSymbol2Obj.containsKey(tickerSymbol)) continue; DatayesdbpSecurity curSecurity = tickerSymbol2Obj.get(tickerSymbol); /** * if themes came from database do not contains the current theme (comes from mongo db), * set mongo theme's id to -1 */ Long themeID = themeName2ID.containsKey(themeName) ? themeName2ID.get(themeName) : -1L; if (themeID == -1L || (globalThemeSecPairMap.containsKey(themeID) && globalThemeSecPairMap.get(themeID).contains(curSecurity.getTickerSymbol()))) continue; ThemeSecurityObjPair themeSecurityPair = new ThemeSecurityObjPair( themeID, themeName, curSecurity, webSite, findTime, pairType); themeSecurityPairList.add(themeSecurityPair); if (!globalThemeSecPairMap.containsKey(themeID)) globalThemeSecPairMap.put(themeID, new HashSet<String>()); globalThemeSecPairMap.get(themeID).add(curSecurity.getTickerSymbol()); } } catch (Exception e) { logger.error(e.toString()); } } return themeSecurityPairList; }
/**
 * Finds theme-security pairs for the given themes from mongo data.
 *
 * <p>For each theme, mongo records are searched by theme name over a window that ends now and
 * starts {@code mongoTimeInterval} back (via {@code DateUtil.addDay}). The interval is parsed
 * from {@code ConfigConst.MONGO_TIME_INTERVAL_SCHEDULE} or
 * {@code ConfigConst.MONGO_TIME_INTERVAL_UNSCHEDULE}, depending on {@code isSchedule}; if that
 * value is not a valid integer, 5 (scheduled) or 50 (unscheduled) is used. The matching records
 * are converted with {@link #getThemeSecurityPairList}.
 *
 * <p>A failure while processing one theme is logged and that theme is left out of the result.
 * Per the original author, the result is QA'd before being inserted into the database.
 *
 * @author Dengxiang.Liu
 * @param themeSet themes to process; per the original author either never-handled themes (see
 *     getUnhandledThemeList) or already-handled ones, depending on {@code isSchedule}; may be
 *     {@code null}
 * @param securityList securities passed through to {@code getThemeSecurityPairList}
 * @param isSchedule {@code true} if the themes are not new to the system; {@code null} is
 *     treated as {@code false}
 * @return map from theme name to that theme's pair list; never {@code null}, empty when
 *     {@code themeSet} is {@code null}
 */
public Map<String, List<ThemeSecurityObjPair>> findThemeSecPair(
    Set<BigdataTheme> themeSet, List<DatayesdbpSecurity> securityList, Boolean isSchedule) {
  Map<String, List<ThemeSecurityObjPair>> themeSecurityPairMap =
      new HashMap<String, List<ThemeSecurityObjPair>>();
  logger.info("findThemeSecPair with mongo data");
  logger.info(
      "themeSet size: {}; securityList size: {}, isSchedule: {}",
      themeSet == null ? 0 : themeSet.size(),
      securityList == null ? 0 : securityList.size(),
      isSchedule);
  if (null == themeSet) {
    return themeSecurityPairMap;
  }

  // Unbox defensively: a null Boolean in a condition would throw a NullPointerException.
  boolean scheduled = Boolean.TRUE.equals(isSchedule);

  /** configure data scale */
  int mongoTimeInterval;
  try {
    mongoTimeInterval =
        Integer.parseInt(
            scheduled
                ? ConfigConst.MONGO_TIME_INTERVAL_SCHEDULE
                : ConfigConst.MONGO_TIME_INTERVAL_UNSCHEDULE);
  } catch (NumberFormatException e) {
    mongoTimeInterval = scheduled ? 5 : 50;
    logger.error(
        "invalid mongo time interval config, fall back to default: {}", mongoTimeInterval, e);
  }

  Date endDate = new Date();
  Date mongoStartDate = DateUtil.addDay(endDate, -1 * mongoTimeInterval);
  // Format once; the same strings are logged and sent to the query, so the log shows the
  // range that is actually searched.
  String startDateStr = DateUtil.dateToStr(mongoStartDate, DateUtil.DatePattern.day);
  String endDateStr = DateUtil.dateToStr(endDate, DateUtil.DatePattern.day);
  logger.info("mongo data from: {}, to {}", startDateStr, endDateStr);

  for (BigdataTheme bigdataTheme : themeSet) {
    try {
      String themeName = bigdataTheme.getThemeName();

      /** get mongo data ; with theme_name */
      List<MongoThemeStock> mongoThemeStockList =
          mongoThemeStockDao.searchByThemeName(themeName, startDateStr, endDateStr);
      logger.info(
          "themeName: {}, mongoThemeStockList size: {}",
          themeName,
          mongoThemeStockList == null ? 0 : mongoThemeStockList.size());

      /** get theme-security pair from mongo data */
      List<ThemeSecurityObjPair> themeSecurityPairListMongo =
          getThemeSecurityPairList(mongoThemeStockList, securityList, themeSet);

      // Always store a fresh list so the map never holds null values.
      List<ThemeSecurityObjPair> themeSecurityPairList = new ArrayList<ThemeSecurityObjPair>();
      if (null != themeSecurityPairListMongo) {
        themeSecurityPairList.addAll(themeSecurityPairListMongo);
      }

      /** @Map themeName ---> ThemeSecurityPair List */
      themeSecurityPairMap.put(themeName, themeSecurityPairList);
    } catch (Exception e) {
      // One failing theme must not stop the remaining themes from being processed.
      logger.error(e.toString(), e);
    }
  }
  return themeSecurityPairMap;
}