private SectorsIndustriesCompanies parseUrlRecursive(String url, List<String> errors) { SectorsIndustriesCompanies ret = new SectorsIndustriesCompanies(); Map<Sector, String> urls = parseSectorPage(url, errors, SECTOR); // sectors page has a "Sector" on the header int lastSlash = url.lastIndexOf("/"); String root = url.substring(0, lastSlash + 1); for (Sector s : urls.keySet()) { String sectorUrl = root + urls.get(s); log.info("Getting industries and companies for sector: " + s.getDescription()); Map<Sector, String> industriesUrls = parseSectorPage( sectorUrl, errors, DESCRIPTION); // industries pages have a "Description" instead of "Sector" ret.addIndustriesToSector(s, industriesUrls.keySet()); for (Sector industry : industriesUrls.keySet()) { if (industry != null) { String industryUrl = root + industriesUrls.get(industry); Map<Sector, String> companiesUrl = parseSectorPage( industryUrl, errors, DESCRIPTION); // companies pages have a "Description" instead of "Sector" for (Sector cmpnyAsSector : companiesUrl.keySet()) { String desc = cmpnyAsSector.getDescription(); if (!desc.startsWith("Sector:") && !desc.startsWith("Industry:")) { try { // String companyUrl = companiesUrl.get(cmpnyAsSector); Company company = new Company(); int st = desc.lastIndexOf("("); int et = 0; if (st > 0) { et = st - 1; } String name = desc.substring(0, et); String ticker = desc.substring(st + 1, desc.length() - 1); // String[] tokens = desc.split("\\("); // String name = tokens[0]; // String[] another = tokens[1].split("\\)"); // String ticker = another[0]; company.setName(name); company.setTicker(ticker); ret.addCompanyToIndustry(industry, company); } catch (Throwable t) { log.error(industryUrl + " : cannot split this: " + desc, t); } } } } } } return ret; }
private Map<Sector, String> parseSectorPage( String url, List<String> errors, String sectorOrDescription) { Map<Sector, String> urls = new ConcurrentSkipListMap<>(); Document document; try { document = Jsoup.connect(url) .header("Accept-Encoding", "gzip, deflate") .userAgent("Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0") .maxBodySize(0) .timeout(600000) .get(); Element e = document.select("table").get(3); Set<String> headersFound = new HashSet<String>(); Elements rows = e.select("tr"); for (Element row : rows) { Elements hdrs = e.select("th"); int i = 0; boolean foundTable = false; for (Element h : hdrs) { String hd = webParsingUtils.advanceTrim(h.text()); if (!MORE_INFO.equals(hd)) { headersFound.add(hd); foundTable = checkExpectedHeader(i++, hd, sectorOrDescription); } } if (foundTable) { Elements sectorElements = row.select("tr"); for (Element sctElemt : sectorElements) { Elements tr = sctElemt.select("td"); if (!tr.isEmpty()) { Sector s = new Sector(); if (tr.size() == 1) { continue; } Element sector = tr.get(0); try { s.setDescription(sector.text()); // "Sector" s.setOneDayPriceChangePercent(parseDouble(tr.get(1))); // "1 Day Price Change %" s.setMarketCap(parseDouble(tr.get(2))); // "Market Cap" s.setPeRatio(parseDouble(tr.get(3))); // "P/E" s.setRoePercent(parseDouble(tr.get(4))); // "ROE %" s.setDividendYield(parseDouble(tr.get(5))); // "Div. Yield %" s.setLongTermDebtToEquity(parseDouble(tr.get(6))); // "Long-Term Debt to Equity" s.setPriceToBookValue(parseDouble(tr.get(7))); // "Price to Book Value" s.setNetProfitMarginPercent(parseDouble(tr.get(8))); // "Net Profit Margin % (mrq)" s.setPriceToFreeCashFlow(parseDouble(tr.get(9))); // "Price to Free Cash Flow (mrq)" } catch (NumberFormatException nfe) { log.error("Cannot parse " + url + " , row: " + tr, nfe); } Elements sectorUrl = sector.select("a"); if (!sectorUrl.isEmpty()) { String childUrl = sectorUrl.get(0).attr("href"); urls.put(s, childUrl); } } } } } // validate headers: for (String h : headersExpected) { if (!headersFound.contains(h)) { errors.add("Cannot find expected entry: " + h + ", found entries:" + headersFound); } } } catch (MalformedURLException e) { log.error("Unexpected IO error while getting list of companies and sectors ", e); } catch (IOException e1) { log.error("Unexpected IO error while getting list of companies and sectors ", e1); } catch (Throwable t) { log.error("Unexpected error while getting list of companies and sectors ", t); } return urls; }