Example #1
0
  /**
   * Collects the {@code standardid} of every data row of the table whose id is
   * {@code ctl00_ContentPlaceHolder1_StandardView} and appends it to the list stored under
   * {@code pageNo} in {@code map}.
   *
   * <p>Rows for which no id can be extracted (unexpected cell structure, missing link, missing
   * {@code href} or missing {@code standardid=} parameter) do not abort the scan: their row index
   * is appended to the list stored under {@code pageNo} in {@code error} and the next row is
   * processed.
   *
   * @param pageNo key used for both {@code map} (extracted ids) and {@code error} (failed rows)
   * @param html HTML of the page to scan
   * @throws Exception if the parser fails on {@code html}
   * @throws IllegalStateException if the expected table is not present in {@code html}
   */
  private static void setStandardIdsToMap(Integer pageNo, String html) throws Exception {
    Parser parser = Parser.createParser(html, "gb2312");
    AndFilter tableFilter =
        new AndFilter(
            new TagNameFilter("table"),
            new HasAttributeFilter("id", "ctl00_ContentPlaceHolder1_StandardView"));

    NodeList nodes = parser.parse(tableFilter);
    if (nodes == null || nodes.size() == 0) {
      // Fail with a descriptive message instead of a bare NullPointerException further down.
      throw new IllegalStateException(
          "table ctl00_ContentPlaceHolder1_StandardView not found on page " + pageNo);
    }
    TableTag table = (TableTag) nodes.elementAt(0);

    TableRow[] rows = table.getRows();
    // Row 0 is skipped (presumably the header row - confirm against the page layout).
    for (int i = 1; i < rows.length; i++) {
      String standardId = extractStandardId(rows[i]);
      if (standardId == null) {
        recordErrorRow(pageNo, i);
        continue;
      }

      List<String> ids = map.get(pageNo);
      if (ids == null) {
        ids = new ArrayList<String>();
      }
      ids.add(standardId);
      map.put(pageNo, ids);
    }
  }

  /**
   * Extracts the value of the {@code standardid} parameter from the link found at child index 2
   * of the {@code Div} at child index 1 of the row's fourth cell.
   *
   * @return the id, or {@code null} when the row does not have that structure or the link's
   *     {@code href} carries no {@code standardid=} parameter
   */
  private static String extractStandardId(TableRow row) {
    final String idKey = "standardid=";

    TableColumn[] cols = row.getColumns();
    if (cols == null || cols.length <= 3) {
      return null;
    }
    NodeList cellChildren = cols[3].getChildren();
    if (cellChildren == null
        || cellChildren.size() < 2
        || !(cellChildren.elementAt(1) instanceof Div)) {
      return null;
    }
    NodeList divChildren = ((Div) cellChildren.elementAt(1)).getChildren();
    if (divChildren == null
        || divChildren.size() < 3
        || !(divChildren.elementAt(2) instanceof LinkTag)) {
      return null;
    }

    String href = ((LinkTag) divChildren.elementAt(2)).getAttribute("href");
    if (href == null) {
      return null;
    }
    int keyStart = href.indexOf(idKey);
    if (keyStart == -1) {
      return null;
    }
    int valueStart = keyStart + idKey.length();
    // Look for the separator only after the key so an earlier "&amp;" cannot invert the range;
    // when the id is the last parameter it runs to the end of the href.
    int valueEnd = href.indexOf("&amp;", valueStart);
    if (valueEnd == -1) {
      valueEnd = href.length();
    }
    return href.substring(valueStart, valueEnd);
  }

  /** Appends {@code rowIndex} to the list of failed rows stored under {@code pageNo}. */
  private static void recordErrorRow(Integer pageNo, int rowIndex) {
    List<Integer> failedRows = error.get(pageNo);
    if (failedRows == null) {
      failedRows = new ArrayList<Integer>();
    }
    failedRows.add(rowIndex);
    error.put(pageNo, failedRows);
  }
Example #2
0
  /**
   * Reads rows 1 through 10 of the sixth {@code table} element found in {@code text} and, for
   * each row, appends two strings to the list stored under {@code key} in {@code detailMap}: the
   * joined text of the second cell followed by the joined text of the fourth cell.
   *
   * @param key key of the list in {@code detailMap} that receives the values
   * @param text HTML to scan
   * @throws Exception if the parser fails on {@code text}
   */
  private static void addDetailToMap(String key, String text) throws Exception {
    Parser htmlParser = Parser.createParser(text, "gb2312");
    NodeList tables = htmlParser.parse(new TagNameFilter("table"));

    // The detail table is addressed purely by position: sixth table, rows 1..10.
    TableTag detailTable = (TableTag) tables.elementAt(5);
    TableRow[] tableRows = detailTable.getRows();

    for (int rowIndex = 1; rowIndex <= 10; rowIndex++) {
      TableColumn[] cells = tableRows[rowIndex].getColumns();

      String firstValue = joinTextNodes(cells[1]);
      String secondValue = joinTextNodes(cells[3]);

      List<String> details = detailMap.get(key);
      if (details == null) {
        details = new ArrayList<String>();
      }
      details.add(firstValue);
      details.add(secondValue);
      detailMap.put(key, details);
    }
  }

  /**
   * Joins the text of all {@code TextNode} children of the cell's second child node, separated
   * by single spaces, and trims the result.
   */
  private static String joinTextNodes(TableColumn cell) {
    NodeList parts = cell.getChildren().elementAt(1).getChildren();
    StringBuilder joined = new StringBuilder();
    for (int j = 0; j < parts.size(); j++) {
      if (parts.elementAt(j) instanceof TextNode) {
        joined.append(parts.elementAt(j).getText()).append(" ");
      }
    }
    return joined.toString().trim();
  }
  /**
   * Returns the draw result for the requested phase.
   *
   * <p>The result of {@code nowPhaseResult()} is returned directly when {@code phase} is null or
   * empty, or when it equals that draw's phase. Otherwise the page at
   * {@code RESULT_MORE_LOCALITY_URL} is downloaded and the rows (row 0 skipped) of the first
   * node with {@code class="mytable"} are searched for one whose first cell equals
   * {@code phase}; its second cell, trimmed and with spaces replaced by commas, becomes the
   * result.
   *
   * @param phase phase to look up; null or empty means "current draw"
   * @return the matching draw, or {@code null} when the page cannot be fetched or parsed, the
   *     table is missing, or no row matches (for a null/empty {@code phase}: whatever
   *     {@code nowPhaseResult()} returned)
   */
  @Override
  public LotteryDraw fetchResultDetail(String phase) {
    LotteryDraw currentDraw = nowPhaseResult();
    if (phase == null || phase.isEmpty()) {
      return currentDraw;
    }
    // currentDraw may be null; comparing from the non-null phase side also tolerates a draw
    // without a phase.
    if (currentDraw != null && phase.equals(currentDraw.getPhase())) {
      return currentDraw;
    }

    String url = RESULT_MORE_LOCALITY_URL;
    String pageInfo = "结果页面" + url;
    String encoding = "utf-8";
    String logHeader =
        "=="
            + lotteryScope
            + "=="
            + siteName
            + "=="
            + pageInfo
            + "==抓取=="
            + getLotteryType().getName()
            + "==";

    String data;
    try {
      data = CoreFetcherUtils.URLGet(url, null, encoding);
    } catch (Exception e) {
      logger.error("获取html数据失败" + e.getMessage(), e);
      return null;
    }

    // contains() also catches a body that starts with the marker (index 0).
    if (data == null || data.isEmpty() || data.contains("404 Not Found")) {
      logger.error(logHeader + "data is null or 404 Not Found");
      return null;
    }

    Parser parser;
    try {
      parser = Parser.createParser(data, encoding);
    } catch (Exception e) {
      logger.error("解析html页面失败" + e.getMessage(), e);
      return null;
    }

    LotteryDraw lotteryDraw = null;
    try {
      NodeList nodeList =
          parser.extractAllNodesThatMatch(new HasAttributeFilter("class", "mytable"));
      if (nodeList == null
          || nodeList.size() == 0
          || !(nodeList.elementAt(0) instanceof TableTag)) {
        logger.error(logHeader + "未找到class为mytable的表格");
        return null;
      }
      TableRow[] tableRows = ((TableTag) nodeList.elementAt(0)).getRows();
      for (int i = 1; i < tableRows.length; i++) {
        TableColumn[] tableColumns = tableRows[i].getColumns();
        if (tableColumns == null || tableColumns.length < 2) {
          // Not a phase/result row.
          continue;
        }
        String phaseTmp = tableColumns[0].toPlainTextString();
        // phase is known to be non-null and non-empty here, so equality alone suffices.
        if (!phase.equals(phaseTmp)) {
          continue;
        }
        lotteryDraw = new LotteryDraw();
        // Phase
        lotteryDraw.setPhase(phaseTmp);
        // Draw result
        String strResult = tableColumns[1].toPlainTextString();
        strResult = strResult.trim().replace(" ", ",");
        lotteryDraw.setResult(strResult);
        // Lottery type
        lotteryDraw.setLotteryType(super.getLotteryType());
        break;
      }
    } catch (ParserException e) {
      logger.error("数据解析错误==" + e.getMessage(), e);
      return null;
    }
    return lotteryDraw;
  }
Example #4
0
  /**
   * Extracts the individual courses from the tables of a course page.
   *
   * <p>Every row of every table is scanned. A row becomes a course when its second cell
   * contains {@code "["} and is at least 10 characters long; the course code is taken from
   * characters 1-8 and the course name from character 10 onwards. Cells 2 to 8 of such a row
   * supply credit, type, learn type, check type, get type, score and remark. Any cell containing
   * {@code "学年学期:"} updates the school year and semester that are stamped onto the courses
   * found from that point on.
   *
   * @param html HTML of the course page
   * @return the parsed courses; empty (never {@code null}) when {@code html} is null or empty or
   *     cannot be parsed
   */
  public List<Courses> parseCourses(String html) {
    List<Courses> list = new ArrayList<Courses>();
    if (html == null || html.isEmpty()) {
      return list;
    }

    NodeList nodeList;
    try {
      Parser parser = new Parser();
      parser.setInputHTML(html);
      parser.setEncoding("utf-8");
      nodeList = parser.extractAllNodesThatMatch(new NodeClassFilter(TableTag.class));
    } catch (ParserException e) {
      // No parsed document means nothing to extract; previously execution continued and hit a
      // NullPointerException on the missing node list.
      e.printStackTrace();
      return list;
    }
    if (nodeList == null) {
      return list;
    }

    final String termLabel = "学年学期:";
    final int yearLength = "2012-2013".length();
    String schoolyear = "";
    String semester = "";
    for (int i = 0; i < nodeList.size(); i++) {
      if (!(nodeList.elementAt(i) instanceof TableTag)) {
        continue;
      }
      TableRow[] rows = ((TableTag) nodeList.elementAt(i)).getRows();
      for (int j = 0; j < rows.length; j++) {
        TableColumn[] columns = rows[j].getColumns();
        Courses courses = null;
        for (int k = 0; k < columns.length; k++) {
          String info = columns[k].toPlainTextString().trim();

          int labelAt = info.indexOf(termLabel);
          if (labelAt != -1) {
            int yearStart = labelAt + termLabel.length();
            int yearEnd = yearStart + yearLength;
            if (info.length() >= yearEnd) {
              schoolyear = info.substring(yearStart, yearEnd);
            }
            // The semester is the single character three positions after the year; it is
            // normalised to a digit when it is one of the two known Chinese numerals.
            if (info.length() >= yearEnd + 4) {
              semester = info.substring(yearEnd + 3, yearEnd + 4);
              if ("一".equals(semester)) {
                semester = "1";
              } else if ("二".equals(semester)) {
                semester = "2";
              }
            }
          }

          if (k == 1) {
            // Length guard: the fixed offsets below need at least 10 characters.
            if (info.indexOf("[") != -1 && info.length() >= 10) {
              courses = new Courses();
              courses.setCourseCode(info.substring(1, 9));
              courses.setCoursesname(info.substring(10));
            }
            continue;
          }
          if (courses == null) {
            // Not (yet) a course row: the remaining cells carry no course data.
            continue;
          }
          switch (k) {
            case 2:
              try {
                courses.setCredit(Double.parseDouble(info));
              } catch (NumberFormatException ignored) {
                // Blank or non-numeric credit cell: keep the course and leave its credit unset
                // rather than aborting the whole parse.
              }
              break;
            case 3:
              courses.setType(info);
              break;
            case 4:
              courses.setLeanType(info);
              break;
            case 5:
              courses.setCheckType(info);
              break;
            case 6:
              courses.setGetType(info);
              break;
            case 7:
              // Score is stored as text, not parsed as a number.
              courses.setScore(info);
              break;
            case 8:
              courses.setRemark(info);
              break;
            default:
              break;
          }
        } // end for k
        if (courses != null) {
          courses.setSchoolYear(schoolyear);
          courses.setSemester(semester);
          list.add(courses);
        }
      } // end for j
    }
    return list;
  }
Example #5
0
  /**
   * Extracts timetable entries from the tables of a timetable page.
   *
   * <p>Only tables whose {@code getText()} contains {@code "[课程号]"} are processed, and their
   * first row is skipped. Within a row, cell 1 is expected to look like {@code [code]name},
   * cell 3 holds the credit, cell 4 the type, cell 5 the teacher (text after {@code "]"}) and
   * cell 8 the time/address description, which {@code praseStr} splits into entries. One
   * {@code TimeTable} clone is added to the result per entry.
   *
   * @param html HTML of the timetable page
   * @return the parsed entries; empty (never {@code null}) when {@code html} is null or empty or
   *     cannot be parsed
   */
  public List<TimeTable> parseTimeTables(String html) {
    List<TimeTable> list = new ArrayList<TimeTable>();
    if (html == null || html.isEmpty()) {
      return list;
    }

    NodeList nodeList;
    try {
      Parser parser = new Parser();
      parser.setInputHTML(html);
      parser.setEncoding("utf-8");
      nodeList = parser.extractAllNodesThatMatch(new NodeClassFilter(TableTag.class));
    } catch (ParserException e) {
      // No parsed document means nothing to extract; previously execution continued and hit a
      // NullPointerException on the missing node list.
      e.printStackTrace();
      return list;
    }
    if (nodeList == null) {
      return list;
    }

    for (int i = 0; i < nodeList.size(); i++) {
      if (!(nodeList.elementAt(i) instanceof TableTag)) {
        continue;
      }
      TableTag tag = (TableTag) nodeList.elementAt(i);
      // NOTE(review): depending on the htmlparser version, getText() on a composite tag may
      // return only the tag's own markup rather than its content - confirm this header check
      // selects the intended tables.
      if (tag.getText().indexOf("[课程号]") == -1) {
        continue;
      }
      TableRow[] rows = tag.getRows();
      for (int j = 1; j < rows.length; j++) {
        TableColumn[] columns = rows[j].getColumns();
        TimeTable timeTable = null;
        for (int k = 0; k < columns.length; k++) {
          String info = columns[k].toPlainTextString().trim();
          if (k == 1) {
            int start = info.indexOf("[");
            int end = info.indexOf("]");
            if (end < start + 1) {
              // No usable "[code]" prefix: substring would throw, so this is not a course row.
              break;
            }
            timeTable = new TimeTable();
            timeTable.setCourseCode(info.substring(start + 1, end));
            timeTable.setCourseName(info.substring(end + 1));
            continue;
          }
          if (timeTable == null) {
            // Only reached for cell 0, which carries no course data.
            continue;
          }
          switch (k) {
            case 3:
              try {
                timeTable.setCredit(Double.parseDouble(info));
              } catch (NumberFormatException ignored) {
                // Blank or non-numeric credit cell: keep the entry and leave its credit unset
                // rather than aborting the whole parse.
              }
              break;
            case 4:
              timeTable.setType(info);
              break;
            case 5:
              // Everything after the first "]" (or the whole text when there is none).
              timeTable.setTeacher(info.substring(info.indexOf("]") + 1));
              break;
            case 8:
              for (TimeAndAdress ta : praseStr(info)) {
                timeTable.setAddress(ta.getAddress());
                timeTable.setTime(ta.getTime());
                timeTable.setCycle(ta.getCycle());
                timeTable.setSingleDouble(ta.getSingleDouble());
                timeTable.setWeek(ta.getWeek());
                // Clone so each time/address entry gets its own TimeTable instance.
                list.add(timeTable.clone());
              }
              break;
            default:
              break;
          }
        }
      } // end for j
    }
    return list;
  }
  /**
   * Downloads {@code SCHEDULE_URL} and converts the rows of the first table inside
   * {@code div[class='box-tbl']} (rows 0 and 1 skipped) into schedule items.
   *
   * @param officialDate not used by this implementation
   * @return the parsed schedule items; empty when the container holds no table or no usable rows
   * @throws FetchFailedException when the page cannot be fetched, looks like a 404 page, lacks
   *     the {@code box-tbl} container, or any row fails to parse
   */
  @Override
  public List<JclqScheduleItem> fetchJclqSchedule(String officialDate) throws FetchFailedException {
    Map<String, String> headerParams = new HashMap<String, String>();
    headerParams.put("Referer", "http://info.sporttery.cn");
    headerParams.put(
        "User-Agent",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.168 Safari/535.19");
    List<JclqScheduleItem> jclqScheduleItemList = new ArrayList<JclqScheduleItem>();

    String encoding = "gbk";
    String siteName = "中国竞彩网网[竞彩篮球赛程]";
    String logHeader = siteName + SCHEDULE_URL;

    try {
      String webInfo =
          CoreFetcherUtils.URLGetWithHeaderParams(SCHEDULE_URL, headerParams, null, encoding);
      // contains() also catches a body that starts with the marker (index 0).
      if (webInfo == null || webInfo.contains("404 Not Found")) {
        logger.error(logHeader + ",data is null or 404 Not Found");
        throw new FetchFailedException("404 Not Found");
      }

      Parser parser = Parser.createParser(webInfo, encoding);
      NodeList nodeList =
          parser.extractAllNodesThatMatch(new CssSelectorNodeFilter("div[class='box-tbl']"));
      if (nodeList == null || nodeList.size() == 0) {
        logger.error(logHeader + "竞彩篮球赛程数据表格不存在,返回null");
        throw new FetchFailedException("竞彩篮球赛程数据表格不存在");
      }

      // Re-parse just the container's HTML to find the table(s) inside it.
      Parser parser2 = Parser.createParser(nodeList.toHtml(), encoding);
      NodeList tableNodeList = parser2.extractAllNodesThatMatch(new TagNameFilter("table"));
      if (tableNodeList != null && tableNodeList.size() > 0) {
        TableTag catchTableTag = (TableTag) tableNodeList.elementAt(0);
        if (catchTableTag != null) {
          TableRow[] catchRows = catchTableTag.getRows();
          for (int i = 2; i < catchRows.length; i++) {
            JclqScheduleItem item = parseScheduleRow(catchRows[i].getColumns());
            if (item != null) {
              jclqScheduleItemList.add(item);
            }
          }
        }
      }
    } catch (Exception e) {
      // Also re-wraps the FetchFailedExceptions thrown above; only the message is carried over.
      logger.error(logHeader + "竞彩篮球赛程错误" + e.getMessage(), e);
      throw new FetchFailedException(e.getMessage());
    }
    return jclqScheduleItemList;
  }

  /**
   * Builds one schedule item from the cells of a table row.
   *
   * @return the item, or {@code null} when the row has fewer than six cells or its first cell
   *     (the official number) is shorter than five characters
   * @throws Exception when a cell cannot be interpreted (unknown weekday prefix, unparseable
   *     date, team cell without {@code "VS"})
   */
  private JclqScheduleItem parseScheduleRow(TableColumn[] catchColumns) throws Exception {
    if (catchColumns == null || catchColumns.length < 6) {
      return null;
    }
    String officialNum = catchColumns[0].toPlainTextString().trim();
    if (officialNum.length() < 5) {
      return null;
    }

    JclqScheduleItem jclqScheduleItem = new JclqScheduleItem();

    // Match time first: a year of at most two digits gets "20" prepended, and ":00" is always
    // appended before parsing.
    String matchDateStr = catchColumns[3].toPlainTextString().trim();
    String[] yearStr = matchDateStr.split("-");
    if (yearStr[0].length() <= 2) {
      matchDateStr = "20" + matchDateStr + ":00";
    } else {
      matchDateStr = matchDateStr + ":00";
    }
    Date matchDate = CoreDateUtils.parseDate(matchDateStr, CoreDateUtils.DATETIME);
    if (matchDate == null) {
      throw new FetchFailedException("比赛时间解析失败: " + matchDateStr);
    }
    Calendar matchDateCalendar = Calendar.getInstance();
    matchDateCalendar.setTime(matchDate);
    matchDateCalendar.add(Calendar.MINUTE, 1);
    jclqScheduleItem.setMatchDate(matchDateCalendar.getTime());

    Calendar cd = resolveOfficialDate(officialNum, matchDateCalendar);

    String sequence = officialNum.substring(2);
    jclqScheduleItem.setMatchNum(
        CoreDateUtils.formatDate(cd.getTime(), "yyyyMMdd")
            + LotteryConstant.JCLQ_MATCH_NUM_CODE_DEFAULT
            + sequence);
    jclqScheduleItem.setOfficialDate(
        CoreDateUtils.parseDate(CoreDateUtils.formatDate(cd.getTime())));
    Integer oNum = null;
    try {
      oNum = Integer.valueOf(sequence);
    } catch (Exception e) {
      logger.error("截取官方编码时,转换为Integer错误", e);
    }
    jclqScheduleItem.setOfficialNum(oNum);
    jclqScheduleItem.setMatchName(
        JclqUtil.convertMatchName(
            catchColumns[1].toPlainTextString().trim(),
            LotteryType.JCLQ_SF,
            FetcherType.T_PENGINEAPI));

    // Team cell is "<away>VS<home>".
    String team = catchColumns[2].toPlainTextString().trim();
    String[] teamStr = team.split("VS");
    if (teamStr.length < 2) {
      throw new FetchFailedException("对阵信息格式错误: " + team);
    }
    jclqScheduleItem.setAwayTeam(teamStr[0].trim());
    jclqScheduleItem.setHomeTeam(teamStr[1].trim());

    jclqScheduleItem.setStatus(
        "已开售".equals(catchColumns[4].toPlainTextString().trim())
            ? JclqRaceStatus.OPEN
            : JclqRaceStatus.UNOPEN);

    // A play type named in cell 5 is marked as not on sale; every other play type is on sale.
    // NOTE(review): "胜负单关"/"胜负过关" are substrings of "让分胜负单关"/"让分胜负过关", so a cell
    // that lists only the handicap play also marks the plain play as not on sale - confirm the
    // cell format before tightening the match.
    String saleInfo = catchColumns[5].toPlainTextString().trim();
    jclqScheduleItem.setDynamicSaleSfStatus(
        saleInfo.contains("胜负单关")
            ? JclqDynamicSaleStatus.SALE_UNOPEN
            : JclqDynamicSaleStatus.SALE_OPEN);
    jclqScheduleItem.setStaticSaleSfStatus(
        saleInfo.contains("胜负过关")
            ? JclqStaticSaleStatus.SALE_UNOPEN
            : JclqStaticSaleStatus.SALE_OPEN);
    jclqScheduleItem.setDynamicSaleRfsfStatus(
        saleInfo.contains("让分胜负单关")
            ? JclqDynamicSaleStatus.SALE_UNOPEN
            : JclqDynamicSaleStatus.SALE_OPEN);
    jclqScheduleItem.setStaticSaleRfsfStatus(
        saleInfo.contains("让分胜负过关")
            ? JclqStaticSaleStatus.SALE_UNOPEN
            : JclqStaticSaleStatus.SALE_OPEN);
    jclqScheduleItem.setDynamicSaleSfcStatus(
        saleInfo.contains("胜分差单关")
            ? JclqDynamicSaleStatus.SALE_UNOPEN
            : JclqDynamicSaleStatus.SALE_OPEN);
    jclqScheduleItem.setStaticSaleSfcStatus(
        saleInfo.contains("胜分差过关")
            ? JclqStaticSaleStatus.SALE_UNOPEN
            : JclqStaticSaleStatus.SALE_OPEN);
    jclqScheduleItem.setDynamicSaleDxfStatus(
        saleInfo.contains("大小分单关")
            ? JclqDynamicSaleStatus.SALE_UNOPEN
            : JclqDynamicSaleStatus.SALE_OPEN);
    jclqScheduleItem.setStaticSaleDxfStatus(
        saleInfo.contains("大小分过关")
            ? JclqStaticSaleStatus.SALE_UNOPEN
            : JclqStaticSaleStatus.SALE_OPEN);
    return jclqScheduleItem;
  }

  /**
   * Derives the official publication date from the weekday encoded in the first two characters
   * of the official number, today's date and the match time.
   *
   * @return midnight of the computed day
   * @throws FetchFailedException when the weekday prefix is not a key of {@code weekDay}
   */
  private Calendar resolveOfficialDate(String officialNum, Calendar matchDateCalendar)
      throws FetchFailedException {
    Calendar cd = Calendar.getInstance();
    // Clear the time-of-day fields.
    cd.set(Calendar.HOUR_OF_DAY, 0);
    cd.set(Calendar.MINUTE, 0);
    cd.set(Calendar.SECOND, 0);
    cd.set(Calendar.MILLISECOND, 0);

    int nowWeekDay = cd.get(Calendar.DAY_OF_WEEK);
    // presumably weekDay maps the two-character weekday prefix to a Calendar.DAY_OF_WEEK value
    // - confirm against its definition.
    Integer fetchWeekDay = weekDay.get(officialNum.substring(0, 2));
    if (fetchWeekDay == null) {
      // Previously an unboxing NullPointerException with no message.
      throw new FetchFailedException("无法识别官方编号中的星期: " + officialNum);
    }

    if (nowWeekDay != fetchWeekDay) {
      int m = fetchWeekDay - nowWeekDay;
      // More than one day behind today is moved forward by a week; otherwise shift by m days.
      if (m < -1) {
        cd.add(Calendar.DATE, m + 7);
      } else {
        cd.add(Calendar.DATE, m);
      }
    }

    // If the computed day lies after the match time, go back one week.
    if (cd.after(matchDateCalendar)) {
      cd.add(Calendar.DATE, -7);
    }

    // If the computed day is more than a week before the match, advance by whole weeks.
    long weekTimeInMillis = 7L * 24 * 3600 * 1000;
    long diffTimeInMillis = matchDateCalendar.getTimeInMillis() - cd.getTimeInMillis();
    if (diffTimeInMillis > weekTimeInMillis) {
      int diffWeekCount = (int) (diffTimeInMillis / weekTimeInMillis);
      cd.add(Calendar.DATE, 7 * diffWeekCount);
    }
    return cd;
  }
Example #7
0
  /**
   * Creates a list of Grids based on the given HTML string. This works only for table-based HTML
   * documents: every {@code table} element becomes one Grid.
   *
   * <p>Within a table, rows without columns are ignored, the first remaining row supplies the
   * headers, and each later row becomes a grid row provided its column count (as computed by
   * {@code getColumnCount}) equals that of the header row; other rows are ignored with a
   * warning. A {@code colspan} greater than one is padded with empty headers or values.
   *
   * @param html the HTML string.
   * @return a list of Grids, or {@code null} if {@code html} is null or blank.
   * @throws Exception if the HTML cannot be parsed.
   */
  public static List<Grid> fromHtml(String html) throws Exception {
    if (html == null || html.trim().isEmpty()) {
      return null;
    }

    List<Grid> grids = new ArrayList<>();

    Parser parser = Parser.createParser(html, "UTF-8");

    Node[] tables = parser.extractAllNodesThatMatch(new TagNameFilter("table")).toNodeArray();

    for (Node t : tables) {
      Grid grid = new ListGrid();

      TableTag table = (TableTag) t;

      TableRow[] rows = table.getRows();

      Integer firstColumnCount = null;

      for (TableRow row : rows) {
        // Computed once per row and reused for the checks and the log message below.
        int columnCount = getColumnCount(row);

        if (columnCount == 0) // Ignore if no cells
        {
          log.warn("Ignoring row with no columns");
          continue;
        }

        Node[] cells = row.getChildren().extractAllNodesThatMatch(HTML_ROW_FILTER).toNodeArray();

        if (firstColumnCount == null) // First row becomes header
        {
          firstColumnCount = columnCount;

          for (Node c : cells) {
            TagNode cell = (TagNode) c;

            grid.addHeader(new GridHeader(getValue(cell), false, false));

            Integer colSpan = MathUtils.parseInt(cell.getAttribute("colspan"));

            if (colSpan != null && colSpan > 1) {
              grid.addEmptyHeaders((colSpan - 1));
            }
          }
        } else // Rest becomes rows
        {
          if (firstColumnCount.intValue() != columnCount) // Ignore
          {
            // Log the same count that was compared, not TableRow's own column count, so the
            // message reflects why the row was skipped.
            log.warn(
                "Ignoring row which has "
                    + columnCount
                    + " columns since table has "
                    + firstColumnCount
                    + " columns");
            continue;
          }

          grid.addRow();

          for (Node c : cells) {
            // TODO row span

            TagNode cell = (TagNode) c;

            grid.addValue(getValue(cell));

            Integer colSpan = MathUtils.parseInt(cell.getAttribute("colspan"));

            if (colSpan != null && colSpan > 1) {
              grid.addEmptyValues((colSpan - 1));
            }
          }
        }
      }

      grids.add(grid);
    }

    return grids;
  }
Example #8
0
  /**
   * Extracts timetable entries from the table(s) with {@code id="reportArea"}.
   *
   * <p>A row is treated as a course when its second cell contains {@code "["} and is at least
   * 10 characters long; the course code is taken from characters 1-8 and the course name from
   * character 10 onwards. Cells 2 to 6 supply credit, type, teacher, class id and class number,
   * and cell 11 holds the time/address description, which {@code praseStr} splits into entries.
   * One {@code TimeTable} clone is added to the result per entry.
   *
   * @param html HTML of the timetable page
   * @return the parsed entries; empty (never {@code null}) when {@code html} is null or empty or
   *     cannot be parsed
   */
  public List<TimeTable> parseTimeTables(String html) {
    List<TimeTable> list = new ArrayList<TimeTable>();
    if (html == null || html.isEmpty()) {
      return list;
    }

    NodeList nodeList;
    try {
      Parser parser = new Parser();
      parser.setInputHTML(html);
      parser.setEncoding("utf-8");
      NodeFilter tagfilter = new NodeClassFilter(TableTag.class);
      NodeFilter idFilter = new HasAttributeFilter("id", "reportArea");
      nodeList = parser.extractAllNodesThatMatch(new AndFilter(tagfilter, idFilter));
    } catch (ParserException e) {
      // No parsed document means nothing to extract; previously execution continued and hit a
      // NullPointerException on the missing node list.
      e.printStackTrace();
      return list;
    }
    if (nodeList == null) {
      return list;
    }

    for (int i = 0; i < nodeList.size(); i++) {
      if (!(nodeList.elementAt(i) instanceof TableTag)) {
        continue;
      }
      TableRow[] rows = ((TableTag) nodeList.elementAt(i)).getRows();
      for (int j = 0; j < rows.length; j++) {
        TableColumn[] columns = rows[j].getColumns();
        TimeTable timeTable = null;

        for (int k = 0; k < columns.length; k++) {
          String info = columns[k].toPlainTextString().trim();
          if (k == 1) {
            // Length guard: the fixed offsets below need at least 10 characters.
            if (info.indexOf("[") != -1 && info.length() >= 10) {
              timeTable = new TimeTable();
              timeTable.setCourseName(info.substring(10));
              timeTable.setCourseCode(info.substring(1, 9));
            }
            continue;
          }
          if (timeTable == null) {
            // Not (yet) a course row: the remaining cells carry no course data.
            continue;
          }
          switch (k) {
            case 2:
              try {
                timeTable.setCredit(Double.parseDouble(info));
              } catch (NumberFormatException ignored) {
                // Blank or non-numeric credit cell: keep the entry and leave its credit unset
                // rather than aborting the whole parse.
              }
              break;
            case 3:
              timeTable.setType(info);
              break;
            case 4:
              timeTable.setTeacher(info);
              break;
            case 5:
              timeTable.setClassId(info);
              break;
            case 6:
              timeTable.setClassNum(info);
              break;
            case 11:
              for (TimeAndAdress ta : praseStr(info)) {
                timeTable.setAddress(ta.getAddress());
                timeTable.setTime(ta.getTime());
                timeTable.setCycle(ta.getCycle());
                timeTable.setSingleDouble(ta.getSingleDouble());
                timeTable.setWeek(ta.getWeek());
                // Clone so each time/address entry gets its own TimeTable instance.
                list.add(timeTable.clone());
              }
              break;
            default:
              break;
          }
        } // end for k
      } // end for j
    }
    return list;
  }