public void scrapeInformation() throws IOException, SolrServerException { String key = ""; String value = ""; for (int i = 9900; i < 10101; i++) { Document doc = Jsoup.connect("http://www.auto-data.net/en/?f=showCar&car_id=" + i).get(); Elements td = doc.select("td"); for (Element el : td) { if (el.childNodeSize() == 1) { key = el.ownText(); continue; } if (el.childNodeSize() > 1) { value = el.getAllElements().select("strong").text(); if (!key.equals("") && !value.equals("")) { car.put(key, value); key = ""; value = ""; } } } createCar(car, i); if ((i % 100) == 0) persistDocuments(cars); } // persistDocuments(cars); // REMOVE !!!!!!!!!!!!!!!!!!!!!!!!!!! return; }
/**
 * Downloads the liberliber.it audiobook index page for each letter in the loop range
 * (currently only {@code 'a'}) and collects one map per list item found.
 *
 * <p>Each map holds the item's text under {@code "author"} and an absolute link under
 * {@code "url"}. Any exception stops the processing; its stack trace is printed and the
 * entries gathered so far are returned.
 *
 * @param params unused
 * @return the collected author entries, possibly empty
 */
@Override
protected ArrayList<HashMap<String, String>> doInBackground(Void... params) {
    ArrayList<HashMap<String, String>> authors = new ArrayList<HashMap<String, String>>();
    try {
        for (char letter = 'a'; letter <= 'a'; letter++) {
            URL indexUrl = new URL("http://www.liberliber.it/audiolibri/" + letter + "/index.htm");
            Document page = Jsoup.parse(indexUrl, 5000);

            // Walk down the fixed page layout to the list that holds the entries.
            Element list = page.getElementById("riga02_colonna02")
                    .getElementsByClass("contenuto_cornice").first()
                    .getElementsByTag("tbody").first()
                    .getElementsByTag("tr").get(1)
                    .getElementsByTag("td").get(1)
                    .getElementsByTag("ul").first();

            for (Element item : list.getElementsByTag("li")) {
                Element first = item.getAllElements().first();
                HashMap<String, String> entry = new HashMap<String, String>();
                entry.put("author", first.text());
                // unwrap() detaches the element while keeping its children and hands back
                // its first child node; the link is read from that node.
                entry.put("url", first.unwrap().absUrl("href"));
                authors.add(entry);
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    return authors;
}
/**
 * Determines the selected pupil from a parsed page.
 *
 * <p>Every {@code option} element below the node(s) with id
 * {@code ctl00_topMenu_pupil_drdPupils} is looked up through {@code Pupil.getByFormId} using
 * its {@code value} attribute; unknown pupils are created from the option text and value and
 * inserted. The option whose {@code selected} attribute equals {@code "selected"} determines
 * the result (the last such option wins).
 *
 * <p>When no option is present at all, the pupil is taken from the first element with class
 * {@code user-name} (name) and the first element with id {@code ctl00_topMenu_tbUserId}
 * (its {@code value} attribute is the form id).
 *
 * @param doc parsed page
 * @return the selected pupil, never {@code null}
 * @throws ParseException if no pupil is selected, or if the page has neither pupil options
 *     nor the fallback elements
 */
public static Pupil getSelectedPupil(Document doc) throws ParseException {
    boolean found = false;
    Pupil selectedP = null;

    Elements pupilSelectors = doc.getElementsByAttributeValue("id", "ctl00_topMenu_pupil_drdPupils");
    for (Element pupilSelector : pupilSelectors) {
        for (Element pupil : pupilSelector.getAllElements()) {
            if (!pupil.tagName().equals("option")) {
                continue;
            }
            String value = pupil.attr("value");
            found = true;

            Pupil p = Pupil.getByFormId(value);
            if (p == null) {
                p = new Pupil(pupil.text(), value);
                long rowId = p.insert();
                if (BuildConfig.DEBUG) {
                    Log.d("GshisHTMLParser", TS.get() + " Pupil.insert() = " + rowId);
                }
            }
            if (pupil.hasAttr("selected") && pupil.attr("selected").equals("selected")) {
                selectedP = p;
            }
        }
    }

    if (!found) {
        Element userName = doc.getElementsByClass("user-name").first();
        Element userId = doc.getElementsByAttributeValue("id", "ctl00_topMenu_tbUserId").first();
        // first() yields null for an empty match; report that as the declared parse failure
        // instead of letting a NullPointerException escape.
        if (userName == null || userId == null) {
            throw new ParseException("Pupils not found", 0);
        }
        if (BuildConfig.DEBUG) {
            Log.d("GshisParser", TS.get() + " Alternative fields found!");
        }

        String name = userName.text();
        String id = userId.attr("value");
        if (BuildConfig.DEBUG) {
            Log.d("GshisParser", TS.get() + " name=" + name + " id=" + id);
        }

        Pupil p = Pupil.getByFormId(id);
        if (p == null) {
            p = new Pupil(name, id);
            long rowId = p.insert();
            if (BuildConfig.DEBUG) {
                Log.d("GshisParser", TS.get() + " Pupil.insert() = " + rowId);
            }
        }
        selectedP = p;
    }

    if (selectedP == null) {
        throw new ParseException("Pupils not found", 0);
    }
    return selectedP;
}
/**
 * Registers the weeks listed on a parsed page with the schedule and returns the selected one.
 *
 * <p>Every {@code option} element below the node(s) with id {@code ctl00_body_week_drdWeeks}
 * is looked up in {@code s} by its {@code value} attribute. Unknown weeks are created, given
 * a start date derived from the option text and the schedule's form text, and added to
 * {@code s}. The option whose {@code selected} attribute equals {@code "selected"} is marked
 * loaded, updated and returned (the last such option wins).
 *
 * @param doc parsed page
 * @param s schedule the weeks belong to; new weeks are added to it
 * @return the selected week, or {@code null} when options exist but none is selected
 * @throws ParseException if the page has no week options, or an option label or the
 *     schedule's form text does not have the expected layout
 */
public static Week getSelectedWeek(Document doc, Schedule s) throws ParseException {
    boolean found = false;
    Week selectedW = null;
    SimpleDateFormat f = new SimpleDateFormat("yyyy dd.MM", Locale.ENGLISH);
    f.setTimeZone(TimeZone.getTimeZone("Europe/Moscow"));

    Elements weekSelectors = doc.getElementsByAttributeValue("id", "ctl00_body_week_drdWeeks");
    for (Element weekSelector : weekSelectors) {
        for (Element week : weekSelector.getAllElements()) {
            if (!week.tagName().equals("option")) {
                continue;
            }
            String value = week.text();
            found = true;

            Week w = s.getWeek(week.attr("value"));
            if (w == null) {
                w = new Week();
                w.setStart(f.parse(weekStartText(value, s.getFormText())));
                w.setFormText(value);
                w.setFormId(week.attr("value"));
                s.addWeek(w);
            }
            if (week.hasAttr("selected") && week.attr("selected").equals("selected")) {
                selectedW = w;
                long u = w.setLoaded().update();
                if (BuildConfig.DEBUG) {
                    Log.d("GshisHTMLParser", TS.get() + " Week.update() = " + u);
                }
            }
        }
    }

    if (!found) {
        throw new ParseException("Weeks not found", 0);
    }
    return selectedW;
}

/**
 * Builds the {@code "yyyy dd.MM"} text for the first day of a week.
 *
 * <p>The day and month come from the part of {@code weekText} before its {@code "-"}. The
 * year comes from {@code yearsText}: the part before its {@code "-"} when the month is
 * greater than 7, otherwise the part after it.
 *
 * @throws ParseException if either text lacks the expected separators or the month is not a
 *     number (instead of leaking an unchecked exception from the string slicing)
 */
private static String weekStartText(String weekText, String yearsText) throws ParseException {
    try {
        String wBegin = weekText.substring(0, weekText.indexOf("-") - 1);
        String wMonth = wBegin.substring(wBegin.indexOf(".") + 1);
        String year;
        if (Integer.parseInt(wMonth) > 7) {
            year = yearsText.substring(0, yearsText.indexOf("-") - 1);
        } else {
            year = yearsText.substring(yearsText.indexOf("-") + 2);
        }
        return year + " " + wBegin;
    } catch (IndexOutOfBoundsException e) {
        ParseException failure =
                new ParseException("Malformed week \"" + weekText + "\" / year \"" + yearsText + "\"", 0);
        failure.initCause(e);
        throw failure;
    } catch (NumberFormatException e) {
        ParseException failure = new ParseException("Malformed week month in \"" + weekText + "\"", 0);
        failure.initCause(e);
        throw failure;
    }
}
/**
 * Finds the redirect target inside an element.
 *
 * <p>The element itself and all of its descendants are considered; the absolute
 * {@code href} of the last node named {@code a} (in document order) is returned.
 *
 * @param el element to search
 * @return the absolute URL of the last anchor, or an empty string when there is no anchor
 */
private static String getMovedUrl(Element el) {
    Elements candidates = el.getAllElements();
    // Scanning from the end and stopping at the first hit yields the last anchor.
    for (int i = candidates.size() - 1; i >= 0; i--) {
        Element candidate = candidates.get(i);
        if (candidate.nodeName().equals("a")) {
            return candidate.absUrl("href");
        }
    }
    return "";
}
/**
 * Registers the semesters listed on a parsed page with the schedule and returns the selected
 * one.
 *
 * <p>Every {@code option} element below the node(s) with id {@code ctl00_body_drdTerms} is
 * looked up in {@code sch} by its {@code value} attribute. Unknown semesters are created with
 * start and stop dates cut out of the option text and added to {@code sch}. The option whose
 * {@code selected} attribute equals {@code "selected"} is marked loaded, updated and returned
 * (the last such option wins).
 *
 * @param doc parsed page
 * @param sch schedule the semesters belong to; new semesters are added to it
 * @return the selected semester, or {@code null} when options exist but none is selected
 * @throws ParseException if the page has no semester options, or an option label does not
 *     have the expected layout or dates
 */
public static GradeSemester getActiveGradeSemester(Document doc, Schedule sch) throws ParseException {
    boolean found = false;
    GradeSemester selG = null;
    SimpleDateFormat fmt = new SimpleDateFormat("dd.MM.yyyy", Locale.ENGLISH);
    fmt.setTimeZone(TimeZone.getTimeZone("Europe/Moscow"));

    Elements semesterSelectors = doc.getElementsByAttributeValue("id", "ctl00_body_drdTerms");
    for (Element semesterSelector : semesterSelectors) {
        for (Element semester : semesterSelector.getAllElements()) {
            if (!semester.tagName().equals("option")) {
                continue;
            }
            String value = semester.text();
            found = true;

            GradeSemester sem = sch.getSemester(semester.attr("value"));
            if (sem == null) {
                sem = new GradeSemester();

                String startText;
                String stopText;
                try {
                    int dash = value.indexOf("-");
                    // The start date follows a fixed 12-character prefix and ends one
                    // character before the dash; the stop date begins two characters after
                    // the dash and the last two characters of the label are dropped.
                    // NOTE(review): the prefix/suffix presumably are label text around the
                    // dates - confirm against a real page.
                    startText = value.substring(12, dash - 1);
                    stopText = value.substring(dash + 2, value.length() - 2);
                } catch (IndexOutOfBoundsException e) {
                    // A label that is too short or has no dash is a parse failure, not a
                    // programming error; report it through the declared exception.
                    ParseException failure = new ParseException("Malformed semester \"" + value + "\"", 0);
                    failure.initCause(e);
                    throw failure;
                }

                sem.setStart(fmt.parse(startText));
                sem.setStop(fmt.parse(stopText));
                sem.setFormText(value);
                sem.setFormId(semester.attr("value"));
                sch.addSemester(sem);
            }
            if (semester.hasAttr("selected") && semester.attr("selected").equals("selected")) {
                long u = sem.setLoaded().update();
                selG = sem;
                if (BuildConfig.DEBUG) {
                    Log.d("GshisHTMLParser", TS.get() + " Semester.update() = " + u);
                }
            }
        }
    }

    if (!found) {
        throw new ParseException("Semesters not found", 0);
    }
    return selG;
}
/**
 * Registers the learning years listed on a parsed page with the pupil and returns the
 * selected schedule.
 *
 * <p>Every {@code option} element below the node(s) with id
 * {@code ctl00_learnYear_drdLearnYears} is looked up on {@code selPupil} by its {@code value}
 * attribute. Unknown schedules are created from the value and option text, given a start of
 * 01.09 in the year before the {@code "-"} of the option text and a stop of 31.05 in the year
 * after it, and added to {@code selPupil}. The option whose {@code selected} attribute equals
 * {@code "selected"} determines the result (the last such option wins).
 *
 * @param doc parsed page
 * @param selPupil pupil the schedules belong to; new schedules are added to it
 * @return the selected schedule, or {@code null} when options exist but none is selected
 * @throws ParseException if the page has no year options, or an option label does not have
 *     the expected layout or years
 */
public static Schedule getSelectedSchedule(Document doc, Pupil selPupil) throws ParseException {
    boolean found = false;
    Schedule selectedS = null;

    Elements yearSelectors = doc.getElementsByAttributeValue("id", "ctl00_learnYear_drdLearnYears");
    for (Element yearSelector : yearSelectors) {
        for (Element year : yearSelector.getAllElements()) {
            if (!year.tagName().equals("option")) {
                continue;
            }
            String value = year.attr("value");
            found = true;

            Schedule schedule = selPupil.getScheduleByFormId(value);
            if (schedule == null) {
                final SimpleDateFormat f = new SimpleDateFormat("yyyy dd.MM", Locale.ENGLISH);
                f.setTimeZone(TimeZone.getTimeZone("Europe/Moscow"));

                String label = year.text();
                schedule = new Schedule(value, label);

                String startText;
                String stopText;
                try {
                    int dash = label.indexOf("-");
                    startText = label.substring(0, dash - 1) + " 01.09";
                    stopText = label.substring(dash + 2) + " 31.05";
                } catch (IndexOutOfBoundsException e) {
                    // A label without the expected dash layout is a parse failure; report
                    // it through the declared exception instead of an unchecked one.
                    ParseException failure = new ParseException("Malformed year \"" + label + "\"", 0);
                    failure.initCause(e);
                    throw failure;
                }

                Date start = f.parse(startText);
                Date stop = f.parse(stopText);
                schedule.setStart(start);
                schedule.setStop(stop);
                selPupil.addSchedule(schedule);
            }
            if (year.hasAttr("selected") && year.attr("selected").equals("selected")) {
                selectedS = schedule;
            }
        }
    }

    if (!found) {
        throw new ParseException("Years not found", 0);
    }
    return selectedS;
}
/**
 * Extracts the article text from the first {@code div} whose class matches the
 * {@code className} value declared outside this method.
 *
 * <p>Before the text is read, every element within that div whose class attribute equals
 * {@code similar_news_box} (ignoring case) is removed from the document, and the div's full
 * text is printed to standard output.
 *
 * @param doc parsed page; it is modified in place by the removal step
 * @return the own text of the first matching div
 * @throws BusinessException if no matching {@code div} exists
 */
@Override
public String getArticleText(Document doc) throws BusinessException {
    Elements candidates = doc.select("div." + className);
    if (candidates.isEmpty()) {
        throw new BusinessException();
    }

    Element article = candidates.get(0);
    for (Element node : article.getAllElements()) {
        if ("similar_news_box".equalsIgnoreCase(node.className())) {
            node.remove();
        }
    }

    // NOTE(review): the printed value is text() while the returned value is ownText();
    // confirm which of the two the callers are meant to receive.
    System.out.println(article.text());
    return article.ownText();
}