public JobData parse(Document doc) { String keyword = doc.select("meta[name=keywords]").first().attr("content"); String[] keywordArr = keyword.split(","); String uniqueId = extractUniqueId(doc); String companyName = keywordArr[0]; String jobTitle = keywordArr[1]; List<String> cityList = extractCities(doc); Date date = extractDate(doc); String summary = summary(doc); JobData jobData = null; try { jobData = new JobData( trim(SOURCE), trim(uniqueId), trim(companyName), trim(jobTitle), cityList, trim(summary), date, trim(doc.baseUri()), ""); } catch (Exception ex) { System.err.println(doc.baseUri() + " , " + ex.getMessage()); } return jobData; // To change body of implemented methods use File | Settings | File Templates. }
public void index(final String title, String wikitext, Document doc) { try { final String hash = new StringHash(wikitext).hash(); // parse input with JSOUP class Walker { private int secIndex; Walker(int secIndex) { this.secIndex = secIndex; } public int walk(Element el) { Elements children = el.children(); String tagName = el.tagName().toLowerCase(); if (tagName.matches("h[1-6]")) { secIndex++; String secName = el.text(); String key = "SectionTitle" + "/" + title + "/" + hash + "/" + new Integer(secIndex).toString(); String value = secNameFilter(secName); dataStore.put(key, value); } for (Element child : children) { secIndex = (new Walker(secIndex)).walk(child); } return secIndex; } } (new Walker(0)).walk(doc.body()); } catch (Exception e) { System.err.println( "SectionTitleIndexer.index failed due to exception: '" + e.toString() + "'"); } }