Ejemplo n.º 1
0
  /**
   * Runs {@code sql} and, for the first column of every returned row, looks the value up on the
   * DMOZ search page and records the outcome.
   *
   * <p>Each value is normalized with {@code urlProcess}. Values that cannot be normalized, or for
   * which the search page yields no element matching {@code #bd-cross ol li a strong}, are
   * collected in the "unavailable URL" output file. For all other values an {@code UPDATE}
   * statement carrying the '#'-joined matches is collected in the "update URL" output file.
   * Search results are memoized in {@code urls}, so each normalized URL is fetched at most once.
   *
   * <p>Any exception is printed and swallowed; in that case neither output file is written.
   *
   * @param sql query whose first result column holds the raw URL
   */
  private void selectData(String sql) {
    StringBuilder unavailableUrls = new StringBuilder();
    StringBuilder updates = new StringBuilder();
    // try-with-resources guarantees the statement and result set are closed even when a lookup
    // fails half-way through the result set.
    try (Statement st = conn.createStatement();
        ResultSet rs = st.executeQuery(sql)) {
      while (rs.next()) {
        // Read the column once; it is needed for normalization and for both output files.
        String rawUrl = rs.getString(1);
        String url = urlProcess(rawUrl);
        if (url == null || url.length() == 0) {
          unavailableUrls.append(rawUrl).append("\r\n");
          continue;
        }

        Elements type = urls.get(url);
        if (type == null) {
          System.out.println(url);
          // Pass the search term as a request parameter so jsoup URL-encodes it; concatenating
          // it into the query string breaks for values containing '&', '#', spaces, etc.
          Document doc =
              Jsoup.connect("https://www.dmoz.org/search")
                  .data("q", url)
                  .timeout(100 * 1000)
                  .get();
          type = doc.select("#bd-cross ol li a strong");
          urls.put(url, type);
        }
        if (type.isEmpty()) {
          unavailableUrls.append(rawUrl).append("\r\n");
          continue;
        }

        // Strip the surrounding tags from each match and join the contents with '#'.
        // NOTE(review): assumes every match renders as a bare <strong>...</strong> element
        // (no attributes); confirm against the actual page markup.
        final int openTagLength = "<strong>".length();
        final int closeTagLength = "</strong>".length();
        StringBuilder categories = new StringBuilder();
        for (Object ele : type) {
          String html = ele.toString();
          categories.append(html, openTagLength, html.length() - closeTagLength).append('#');
        }
        categories.setLength(categories.length() - 1); // drop the trailing separator

        // NOTE(review): values are embedded in the generated SQL without escaping; a double
        // quote inside rawUrl or a category would produce a malformed statement.
        updates
            .append("UPDATE final_s_t_l2 tb1 SET tb1.typeOfODP=\"")
            .append(categories)
            .append("\" WHERE tb1.urllong=\"")
            .append(rawUrl)
            .append("\";\r\n");
      }
      writeAppend(unavailableUrls.toString(), "out\\unavailableURL.txt");
      writeAppend(updates.toString(), "out\\updateURL.txt");

    } catch (Exception e) {
      e.printStackTrace();
    }
  }
Ejemplo n.º 2
0
  /**
   * Parses every {@code .html} file directly inside {@code path} and fills
   * {@code countryHashMap}, {@code typeHashMap} and {@code regionHashMap} from its contents.
   *
   * <p>For each file the summary section (selected with {@code SUMMARY_CLASS}) is read into a
   * {@code Country}, which is stored under its alpha-2 code. Then every {@code tr} of the first
   * {@code TBODY} inside the element with id {@code DIVISION_ID} is read into a {@code Region}.
   * A region is only stored if its code is not already present in {@code regionHashMap}.
   *
   * <p>Any exception (unreadable directory, malformed row, missing country code, I/O failure)
   * is printed and stops processing of the remaining rows and files; nothing is rethrown.
   *
   * @param path directory containing the HTML files to parse
   */
  public void parse(String path) {
    try {
      File[] files = new File(path).listFiles((dir, name) -> name.endsWith(".html"));
      if (files == null) {
        // File.listFiles returns null (not an empty array) when the path is not a directory or
        // cannot be read; report that explicitly instead of failing with a NullPointerException.
        throw new IllegalStateException("Cannot list files in directory: " + path);
      }
      for (File file : files) {
        if (!file.isFile()) {
          continue;
        }
        Document htmlFile = Jsoup.parse(file, "UTF-8");

        // Get Country
        String currentCountryCode = null;
        Element element = htmlFile.select(SUMMARY_CLASS).first();
        Country country = new Country();
        for (Element line : element.select(SUMMARY_LINE)) {
          String name = line.select(SUMMARY_NAME).first().text();
          String value = line.select(SUMMARY_VALUE).first().text();
          switch (name) {
            case "Alpha-2 code":
              country.Code2 = value;
              currentCountryCode = value;
              break;
            case "Short name":
              country.ShortName = value;
              break;
            case "Short name lower case":
              country.ShortNameLC = value.replace("*", "");
              break;
            case "Full name":
              country.FullName = value.replace("*", "");
              break;
            case "Alpha-3 code":
              country.Code3 = value;
              break;
            case "Numeric code":
              country.NumCode = value;
              break;
            default:
              // Other summary lines are ignored.
          }
        }
        countryHashMap.put(country.Code2, country);

        // Get Divisions
        Element divisions = htmlFile.getElementById(DIVISION_ID);
        Element table = divisions.select(TBODY).first();
        for (Element row : table.select("tr")) {
          Elements cells = row.select("td");
          if (cells.size() > 6) {
            throw new IllegalStateException("Cell count greater than expected.");
          }
          if (cells.size() < 6) {
            // Previously a short row surfaced as a NullPointerException on a missing cell.
            throw new IllegalStateException(
                "Cell count less than expected: " + cells.size() + " in " + file);
          }

          Region region = new Region();

          // Column 0: type name; register the type the first time it is seen.
          String regionTypeName = capitalize(cells.get(0).text());
          if (!typeHashMap.containsKey(regionTypeName)) {
            // NOTE(review): regionTypeName is already capitalized; the second capitalize call
            // looks redundant but is kept because capitalize is not visible here.
            RegionType regionType = new RegionType(capitalize(regionTypeName));
            typeHashMap.put(regionTypeName, regionType);
          }
          region.categoryName = regionTypeName;

          // Column 1: region code, with '*' markers removed.
          region.code = cells.get(1).text().replace("*", "");

          // Column 2: region name.
          region.name = cells.get(2).text();

          // Column 5: parent region code; left unset when the cell is empty.
          String parentCode = cells.get(5).text();
          if (!parentCode.isEmpty()) {
            region.parentCode = parentCode;
          }

          if (currentCountryCode == null || currentCountryCode.isEmpty()) {
            throw new IllegalStateException("Country code is null or empty.");
          }
          region.countryCode = currentCountryCode;

          // The first region seen for a code wins; later duplicates are ignored.
          if (!regionHashMap.containsKey(region.code)) {
            regionHashMap.put(region.code, region);
          }
        }
      }
    } catch (Exception e) {
      e.printStackTrace();
    }
  }