private Map<Sector, String> parseSectorPage( String url, List<String> errors, String sectorOrDescription) { Map<Sector, String> urls = new ConcurrentSkipListMap<>(); Document document; try { document = Jsoup.connect(url) .header("Accept-Encoding", "gzip, deflate") .userAgent("Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0") .maxBodySize(0) .timeout(600000) .get(); Element e = document.select("table").get(3); Set<String> headersFound = new HashSet<String>(); Elements rows = e.select("tr"); for (Element row : rows) { Elements hdrs = e.select("th"); int i = 0; boolean foundTable = false; for (Element h : hdrs) { String hd = webParsingUtils.advanceTrim(h.text()); if (!MORE_INFO.equals(hd)) { headersFound.add(hd); foundTable = checkExpectedHeader(i++, hd, sectorOrDescription); } } if (foundTable) { Elements sectorElements = row.select("tr"); for (Element sctElemt : sectorElements) { Elements tr = sctElemt.select("td"); if (!tr.isEmpty()) { Sector s = new Sector(); if (tr.size() == 1) { continue; } Element sector = tr.get(0); try { s.setDescription(sector.text()); // "Sector" s.setOneDayPriceChangePercent(parseDouble(tr.get(1))); // "1 Day Price Change %" s.setMarketCap(parseDouble(tr.get(2))); // "Market Cap" s.setPeRatio(parseDouble(tr.get(3))); // "P/E" s.setRoePercent(parseDouble(tr.get(4))); // "ROE %" s.setDividendYield(parseDouble(tr.get(5))); // "Div. Yield %" s.setLongTermDebtToEquity(parseDouble(tr.get(6))); // "Long-Term Debt to Equity" s.setPriceToBookValue(parseDouble(tr.get(7))); // "Price to Book Value" s.setNetProfitMarginPercent(parseDouble(tr.get(8))); // "Net Profit Margin % (mrq)" s.setPriceToFreeCashFlow(parseDouble(tr.get(9))); // "Price to Free Cash Flow (mrq)" } catch (NumberFormatException nfe) { log.error("Cannot parse " + url + " , row: " + tr, nfe); } Elements sectorUrl = sector.select("a"); if (!sectorUrl.isEmpty()) { String childUrl = sectorUrl.get(0).attr("href"); urls.put(s, childUrl); } } } } } // validate headers: for (String h : headersExpected) { if (!headersFound.contains(h)) { errors.add("Cannot find expected entry: " + h + ", found entries:" + headersFound); } } } catch (MalformedURLException e) { log.error("Unexpected IO error while getting list of companies and sectors ", e); } catch (IOException e1) { log.error("Unexpected IO error while getting list of companies and sectors ", e1); } catch (Throwable t) { log.error("Unexpected error while getting list of companies and sectors ", t); } return urls; }