/**
 * Runs {@code sql}, looks up category labels for the URL found in the first
 * column of every row, and hands two accumulated texts to {@code writeAppend}:
 * the raw URLs that could not be classified and one {@code UPDATE} statement
 * per classified URL.
 *
 * <p>Lookups are cached in {@code urls}, keyed by the value returned from
 * {@code urlProcess}, so each processed URL is fetched at most once. Any
 * exception aborts the run before the two {@code writeAppend} calls and is
 * only printed; nothing is rethrown to the caller.
 *
 * @param sql query whose first result column holds the URL to classify
 */
private void selectData(String sql) {
    // try-with-resources guarantees the statement and result set are closed,
    // including when the loop below fails part-way through.
    try (Statement st = conn.createStatement();
         ResultSet rs = st.executeQuery(sql)) {
        StringBuilder unavailableURL = new StringBuilder();
        StringBuilder update = new StringBuilder();

        while (rs.next()) {
            String rawUrl = rs.getString(1);
            String url = urlProcess(rawUrl);
            if (url == null || url.length() == 0) {
                unavailableURL.append(rawUrl).append("\r\n");
                continue;
            }

            Elements type;
            if (urls.containsKey(url)) {
                type = urls.get(url);
            } else {
                System.out.println(url);
                Document doc = Jsoup.connect("https://www.dmoz.org/search?q=" + url)
                        .timeout(100 * 1000)
                        .get();
                type = doc.select("#bd-cross ol li a strong");
                urls.put(url, type);
            }

            if (type.isEmpty()) {
                unavailableURL.append(rawUrl).append("\r\n");
                continue;
            }

            // Join the labels with '#'. The substring bounds drop the first 8 and
            // last 9 characters of each element's HTML, i.e. a bare "<strong>" /
            // "</strong>" pair.
            // NOTE(review): assumes the matched elements carry no attributes -- confirm.
            StringBuilder out = new StringBuilder();
            for (Object ele : type) {
                String html = ele.toString();
                if (out.length() > 0) {
                    out.append('#');
                }
                out.append(html.substring(8, html.length() - 9));
            }

            // NOTE(review): values are embedded in the generated SQL text unescaped;
            // a double quote in the label or URL would yield a malformed statement.
            update.append("UPDATE final_s_t_l2 tb1 SET tb1.typeOfODP=\"")
                    .append(out)
                    .append("\" WHERE tb1.urllong=\"")
                    .append(rawUrl)
                    .append("\";\r\n");
        }

        writeAppend(unavailableURL.toString(), "out\\unavailableURL.txt");
        writeAppend(update.toString(), "out\\updateURL.txt");
    } catch (Exception e) {
        e.printStackTrace();
    }
}
public void parse(String path) { Document htmlFile; try { File[] files = new File(path) .listFiles( (dir, name) -> { return name.endsWith(".html"); }); for (File file : files) { if (file.isFile()) { htmlFile = Jsoup.parse(file, "UTF-8"); // Get Country String currentCountryCode = null; Element element = htmlFile.select(SUMMARY_CLASS).first(); Country country = new Country(); for (Element line : element.select(SUMMARY_LINE)) { String name = line.select(SUMMARY_NAME).first().text(); String value = line.select(SUMMARY_VALUE).first().text(); switch (name) { case "Alpha-2 code": country.Code2 = value; currentCountryCode = value; break; case "Short name": country.ShortName = value; break; case "Short name lower case": country.ShortNameLC = value.replace("*", ""); break; case "Full name": country.FullName = value.replace("*", ""); break; case "Alpha-3 code": country.Code3 = value; break; case "Numeric code": country.NumCode = value; break; default: } } countryHashMap.put(country.Code2, country); // Get Divisions Element divisions = htmlFile.getElementById(DIVISION_ID); Element table = divisions.select(TBODY).first(); for (Element row : table.select("tr")) { Elements cells = row.select("td"); if (cells.size() > 6) { throw new Exception("Cell count greater than expected."); } Element[] cellArray = cells.toArray(new Element[6]); Region region = new Region(); for (int x = 0; x < 6; x++) { if (x == 0) { // Get type name add to list. 
String regionTypeName = capitalize(cellArray[x].text()); if (!typeHashMap.containsKey(regionTypeName)) { RegionType regionType = new RegionType(capitalize(regionTypeName)); typeHashMap.put(regionTypeName, regionType); } region.categoryName = regionTypeName; } if (x == 1) { // Get region code region.code = cellArray[x].text().replace("*", ""); } if (x == 2) { // Get region name region.name = cellArray[x].text(); } if (x == 5) { // Get region parent String parentCode = cellArray[x].text(); if (!parentCode.isEmpty()) { region.parentCode = parentCode; } } } if (currentCountryCode == null || currentCountryCode.isEmpty()) { throw new Exception("Country code is null or empty."); } region.countryCode = currentCountryCode; if (!regionHashMap.containsKey(region.code)) { regionHashMap.put(region.code, region); } } } } } catch (Exception e) { e.printStackTrace(); } }