/** 获取文章作者,发布时间等数据 */ public void processAuthor() throws Exception { HashMap map = new HashMap(); map.put("status", 1); List<ArticleDoc> list = articleDocDao.find(map); for (ArticleDoc bean : list) { if (!bean.getUrl().startsWith("http://www.pcpop.com/doc/")) { continue; } logger.debug("获取文章数据"); // 更新文章的作者和发布时间 try { String author = author(bean.getUrl()); String tmp1 = author.substring(author.lastIndexOf("作者") + 3, author.lastIndexOf("编辑") - 1); String tmp2 = author.substring(0, author.indexOf(":") - 2); bean.setAuthor(tmp1); bean.setPublishTime(tmp2); bean.setStatus(2); if (articleDocDao.update(bean) == 0) { logger.info("更新作者失败:" + bean.getUrl()); } else { processCount++; logger.info( "[" + bean.getId() + "]更新成功:" + bean.getUrl() + "\tprocessCount:" + processCount); } } catch (java.io.FileNotFoundException e) { bean.setStatus(10); if (articleDocDao.update(bean) == 0) { logger.error("更新作者失败:" + bean.getUrl()); } else { logger.error("[" + bean.getId() + "]更新记录状态为10[文件或地址查找找不到]:" + bean.getUrl()); } continue; } catch (org.htmlparser.util.ParserException e) { bean.setStatus(10); if (articleDocDao.update(bean) == 0) { logger.error("更新作者失败:" + bean.getUrl()); } else { logger.error("[" + bean.getId() + "]更新记录状态为10[URL解析失败]:" + bean.getUrl()); } continue; } catch (Exception e) { bean.setStatus(11); bean.setContent(e.getMessage()); if (articleDocDao.update(bean) == 0) { logger.error("更新作者和文章发布时间失败:" + bean.getUrl()); } else { logger.error("[" + bean.getId() + "]更新记录状态为11[其他异常情况]:" + bean.getUrl()); } continue; } } }
void contentProcess() { HashMap map = new HashMap(); map.put("status", 2); try { List<ArticleDoc> list = articleDocDao.find(map); for (ArticleDoc bean : list) { if (!bean.getUrl().startsWith("http://www.pcpop.com/doc/")) { continue; } try { // String content = content(bean.getUrl()); // if(null != content){ // bean.setContent(content); bean.setStatus(3); if (articleDocDao.update(bean) == 0) { logger.debug("更新作者失败:" + bean.getUrl()); } else { if (null != client.get(getKey(bean.getUrl()))) { client.remove(getKey(bean.getUrl())); client.put(getKey(bean.getUrl()), bean); } logger.debug("[" + bean.getId() + "]更新文章内容成功"); } // } } catch (Exception e) { bean.setStatus(11); bean.setContent(e.getMessage()); if (articleDocDao.update(bean) == 0) { logger.debug("更新文章内容失败:" + bean.getUrl()); } else { logger.debug("[" + bean.getId() + "]更新记录状态为11[其他异常情况]:" + bean.getUrl()); } continue; } } } catch (Exception e) { } }
public void processWithDoc() throws Exception { // Iterator hit = HTMLHASH.keySet().iterator(); // while(hit.hasNext()){ // String key = (String) hit.next(); // logger.debug("key:"+key); // String[] keys = key.split(":"); // String content = (String) HTMLHASH.get(key); // try{ // docByHTML(content, "http://www.pcpop.com/doc/"); // Iterator it = LINKHASH.keySet().iterator(); // ArticleDoc doc = null; // while (it.hasNext()) { // String key1 = (String) it.next(); // if(null == client.get(getKey(key1))){ // LinkBean link = (LinkBean) LINKHASH.get(key1); // doc = new ArticleDoc(); // doc.setTitle(link.getName()); // doc.setUrl(link.getLink()); // doc.setWebId(Integer.valueOf(keys[0])); // int id = articleDocDao.insert(doc); // if(!(id>0)){ // logger.debug("失败,\t链接名称:" + link.getName() + "\n链接地址:"+ // link.getLink()); // }else{ // doc.setId(id); // doc.setStatus(1); // client.add(getKey(doc.getUrl()), doc); // logger.debug("Memcached now store this object"); // } // } // } // LINKHASH.clear(); // }catch(Exception e){ // logger.debug("解析异常,跳过:"+key+"\tException:"+e.getMessage()); // continue; // } // } HashMap map = new HashMap(); map.put("status", 1); List<ArticleDoc> list = articleDocDao.find(map); for (ArticleDoc bean : list) { if (!bean.getUrl().startsWith("http://www.pcpop.com/doc/")) { continue; } logger.debug("获取文章数据"); // 更新文章的作者和发布时间 try { String author = author(bean.getUrl()); String tmp1 = author.substring(author.lastIndexOf("作者") + 3, author.lastIndexOf("编辑") - 1); String tmp2 = author.substring(0, author.indexOf(":") - 2); bean.setAuthor(tmp1); bean.setPublishTime(tmp2); bean.setStatus(2); if (articleDocDao.update(bean) == 0) { logger.debug("更新作者失败:" + bean.getUrl()); } else { logger.debug("[" + bean.getId() + "]更新成功:" + bean.getUrl()); } } catch (java.io.FileNotFoundException e) { bean.setStatus(10); if (articleDocDao.update(bean) == 0) { logger.debug("更新作者失败:" + bean.getUrl()); } else { logger.debug("[" + bean.getId() + "]更新记录状态为10[文件或地址查找找不到]:" + bean.getUrl()); } continue; } catch (org.htmlparser.util.ParserException e) { bean.setStatus(10); if (articleDocDao.update(bean) == 0) { logger.debug("更新作者失败:" + bean.getUrl()); } else { logger.debug("[" + bean.getId() + "]更新记录状态为10[URL解析失败]:" + bean.getUrl()); } continue; } catch (Exception e) { bean.setStatus(11); bean.setContent(e.getMessage()); if (articleDocDao.update(bean) == 0) { logger.debug("更新作者和文章发布时间失败:" + bean.getUrl()); } else { logger.debug("[" + bean.getId() + "]更新记录状态为11[其他异常情况]:" + bean.getUrl()); } continue; } } }