コード例 #1
1
ファイル: Scrape.java プロジェクト: AngelTsanev/AI-project
  /**
   * Scrapes car pages from auto-data.net for car ids 9900 through 10100 (inclusive).
   *
   * <p>On every page each {@code td} cell is inspected: a cell with exactly one child node
   * supplies a pending key (its own text); a cell with more than one child node supplies a value
   * (the text of its {@code strong} descendants). When both are non-empty the pair is stored in
   * {@code car}. After a page has been processed {@code createCar(car, id)} is called, and
   * {@code persistDocuments(cars)} is called whenever the id is a multiple of 100.
   *
   * @throws IOException if a page cannot be fetched
   * @throws SolrServerException presumably raised by {@code persistDocuments} — confirm against
   *     its declaration
   */
  public void scrapeInformation() throws IOException, SolrServerException {
    final int firstCarId = 9900;
    final int lastCarId = 10100;
    final int persistEvery = 100;

    for (int carId = firstCarId; carId <= lastCarId; carId++) {
      Document doc = Jsoup.connect("http://www.auto-data.net/en/?f=showCar&car_id=" + carId).get();

      // The pending key is scoped to a single page so that a label left unpaired at the end of
      // one page can never be attached to a value taken from the next page.
      String key = "";

      for (Element cell : doc.select("td")) {
        int childCount = cell.childNodeSize();
        if (childCount == 1) {
          key = cell.ownText();
        } else if (childCount > 1) {
          String value = cell.getAllElements().select("strong").text();
          if (!key.isEmpty() && !value.isEmpty()) {
            car.put(key, value);
            key = "";
          }
        }
      }

      createCar(car, carId);

      // lastCarId is itself a multiple of persistEvery, so the final batch is flushed here too.
      if (carId % persistEvery == 0) {
        persistDocuments(cars);
      }
    }
  }
コード例 #2
0
    /**
     * Fetches the audiobook index page from liberliber.it and collects one map per list item.
     *
     * <p>Each returned map has two entries: {@code "author"} (the item's text) and {@code "url"}
     * (the absolute {@code href} of the node returned by unwrapping the item). Any exception
     * raised while downloading or navigating the page is printed and whatever was collected so
     * far is returned.
     *
     * @param params unused
     * @return the collected author maps; possibly empty, never {@code null}
     */
    @Override
    protected ArrayList<HashMap<String, String>> doInBackground(Void... params) {
      ArrayList<HashMap<String, String>> result = new ArrayList<HashMap<String, String>>();
      try {
        // The upper bound equals the start letter, so only the "a" index page is requested.
        for (char letter = 'a'; letter <= 'a'; letter++) {
          URL indexUrl = new URL("http://www.liberliber.it/audiolibri/" + letter + "/index.htm");
          Document page = Jsoup.parse(indexUrl, 5000);

          // Walk down the fixed page layout to the <ul> that holds the entries.
          Element list =
              page.getElementById("riga02_colonna02")
                  .getElementsByClass("contenuto_cornice")
                  .first()
                  .getElementsByTag("tbody")
                  .first()
                  .getElementsByTag("tr")
                  .get(1)
                  .getElementsByTag("td")
                  .get(1)
                  .getElementsByTag("ul")
                  .first();

          for (Element item : list.getElementsByTag("li")) {
            Element head = item.getAllElements().first();

            HashMap<String, String> entry = new HashMap<String, String>();
            entry.put("author", head.text());
            entry.put("url", head.unwrap().absUrl("href"));

            result.add(entry);
          }
        }
      } catch (Exception ex) {
        ex.printStackTrace();
      }

      return result;
    }
コード例 #3
0
  /**
   * Determines the currently selected pupil from the parsed page.
   *
   * <p>Every {@code option} below an element with id {@code ctl00_topMenu_pupil_drdPupils} is
   * looked up with {@link Pupil#getByFormId}; when the lookup returns {@code null} a new
   * {@code Pupil} is created from the option's text and value and {@code insert()} is called on
   * it. The option whose {@code selected} attribute equals {@code "selected"} is the result.
   *
   * <p>If no option is present at all, the pupil is taken from the first element with class
   * {@code user-name} (name) and the first element with id {@code ctl00_topMenu_tbUserId} (its
   * {@code value} attribute is the id).
   *
   * @param doc parsed page to inspect
   * @return the selected pupil, never {@code null}
   * @throws ParseException if no pupil can be determined, including the case where the fallback
   *     elements are missing from the page
   */
  public static Pupil getSelectedPupil(Document doc) throws ParseException {

    boolean found = false;
    Pupil selectedP = null;

    Elements pupilSelectors =
        doc.getElementsByAttributeValue("id", "ctl00_topMenu_pupil_drdPupils");
    for (Element pupilSelector : pupilSelectors) {
      for (Element pupil : pupilSelector.getAllElements()) {
        if (!pupil.tagName().equals("option")) {
          continue;
        }

        found = true;
        String value = pupil.attr("value");

        Pupil p = Pupil.getByFormId(value);
        if (p == null) {
          p = new Pupil(pupil.text(), value);
          long rowId = p.insert();

          if (BuildConfig.DEBUG) {
            Log.d("GshisHTMLParser", TS.get() + " Pupil.insert() = " + rowId);
          }
        }

        if (pupil.hasAttr("selected") && pupil.attr("selected").equals("selected")) {
          selectedP = p;
        }
      }
    }

    if (!found) {

      if (BuildConfig.DEBUG) Log.d("GshisParser", TS.get() + " Alternative fields found!");

      Element userName = doc.getElementsByClass("user-name").first();
      Element userId = doc.getElementsByAttributeValue("id", "ctl00_topMenu_tbUserId").first();

      // first() yields null when nothing matched; report that as the declared ParseException
      // instead of letting a NullPointerException escape.
      if (userName == null || userId == null) {
        throw new ParseException("Pupils not found", 0);
      }

      String name = userName.text();
      String id = userId.attr("value");

      if (BuildConfig.DEBUG) Log.d("GshisParser", TS.get() + " name=" + name + " id=" + id);

      Pupil p = Pupil.getByFormId(id);
      if (p == null) {
        p = new Pupil(name, id);
        long rowId = p.insert();

        if (BuildConfig.DEBUG) Log.d("GshisParser", TS.get() + " Pupil.insert() = " + rowId);
      }

      selectedP = p;
    }

    if (selectedP == null) throw new ParseException("Pupils not found", 0);

    return selectedP;
  }
コード例 #4
0
  /**
   * Reads the week drop-down (id {@code ctl00_body_week_drdWeeks}) and returns the selected week.
   *
   * <p>Each {@code option} is looked up in the schedule by its {@code value} attribute. Unknown
   * weeks are created: the start date is parsed (pattern {@code "yyyy dd.MM"}, Europe/Moscow)
   * from the part of the option text before the {@code "-"}, combined with a year taken from the
   * schedule's form text — the part before its {@code "-"} when the start month is greater than
   * 7, otherwise the part after it. The new week is then added to the schedule. The option whose
   * {@code selected} attribute equals {@code "selected"} is marked loaded and updated.
   *
   * @param doc parsed page to inspect
   * @param s schedule the weeks belong to
   * @return the selected week, or {@code null} if options exist but none is marked selected
   * @throws ParseException if no option was found or a start date cannot be parsed
   */
  public static Week getSelectedWeek(Document doc, Schedule s) throws ParseException {

    SimpleDateFormat startFormat = new SimpleDateFormat("yyyy dd.MM", Locale.ENGLISH);
    startFormat.setTimeZone(TimeZone.getTimeZone("Europe/Moscow"));

    Week selectedW = null;
    boolean anyOption = false;

    for (Element selector : doc.getElementsByAttributeValue("id", "ctl00_body_week_drdWeeks")) {
      for (Element option : selector.getAllElements()) {
        if (!option.tagName().equals("option")) {
          continue;
        }

        anyOption = true;
        String label = option.text();
        String formId = option.attr("value");

        Week current = s.getWeek(formId);
        if (current == null) {
          current = new Week();

          // Text before the " -" separator is the week's first day; presumably "dd.MM" —
          // verify against the live page.
          String firstDay = label.substring(0, label.indexOf("-") - 1);
          String month = firstDay.substring(firstDay.indexOf(".") + 1);

          boolean inFirstYear = Integer.parseInt(month) > 7;
          String yearRange = s.getFormText();
          int dash = yearRange.indexOf("-");
          String year = inFirstYear ? yearRange.substring(0, dash - 1) : yearRange.substring(dash + 2);

          current.setStart(startFormat.parse(year + " " + firstDay));
          current.setFormText(label);
          current.setFormId(formId);

          s.addWeek(current);
        }

        if (option.hasAttr("selected") && option.attr("selected").equals("selected")) {
          selectedW = current;
          long updated = current.setLoaded().update();

          if (BuildConfig.DEBUG) {
            Log.d("GshisHTMLParser", TS.get() + " Week.update() = " + updated);
          }
        }
      }
    }

    if (!anyOption) {
      throw new ParseException("Weeks not found", 0);
    }

    return selectedW;
  }
コード例 #5
0
ファイル: test.java プロジェクト: yorickdewid/maven-tweets
 /**
  * Returns the absolute {@code href} of the last {@code a} element found in
  * {@code el.getAllElements()}, or an empty string when there is none.
  */
 private static String getMovedUrl(Element el) {
   Elements candidates = el.getAllElements();
   // Scanning backwards and stopping at the first hit yields the same result as letting the
   // last match overwrite earlier ones.
   for (int i = candidates.size() - 1; i >= 0; i--) {
     Element candidate = candidates.get(i);
     if (candidate.nodeName().equals("a")) {
       return candidate.absUrl("href");
     }
   }
   return "";
 }
コード例 #6
0
  /**
   * Reads the term drop-down (id {@code ctl00_body_drdTerms}) and returns the selected semester.
   *
   * <p>Each {@code option} is looked up in the schedule by its {@code value} attribute. Unknown
   * semesters are created with start and stop dates parsed (pattern {@code "dd.MM.yyyy"},
   * Europe/Moscow) from the option text and added to the schedule. The option whose
   * {@code selected} attribute equals {@code "selected"} is marked loaded and updated.
   *
   * @param doc parsed page to inspect
   * @param sch schedule the semesters belong to
   * @return the selected semester, or {@code null} if options exist but none is marked selected
   * @throws ParseException if no option was found or a date cannot be parsed
   */
  public static GradeSemester getActiveGradeSemester(Document doc, Schedule sch)
      throws ParseException {

    SimpleDateFormat dateFormat = new SimpleDateFormat("dd.MM.yyyy", Locale.ENGLISH);
    dateFormat.setTimeZone(TimeZone.getTimeZone("Europe/Moscow"));

    GradeSemester selG = null;
    boolean anyOption = false;

    for (Element selector : doc.getElementsByAttributeValue("id", "ctl00_body_drdTerms")) {
      for (Element option : selector.getAllElements()) {
        if (!option.tagName().equals("option")) {
          continue;
        }

        anyOption = true;
        String label = option.text();
        String formId = option.attr("value");

        GradeSemester current = sch.getSemester(formId);
        if (current == null) {
          current = new GradeSemester();

          // The start date begins at offset 12 and ends one character before the "-"; the stop
          // date begins two characters after the "-" and drops the last two characters.
          // NOTE(review): this presumes a fixed-width prefix and suffix in the option text —
          // confirm against the live page.
          int dash = label.indexOf("-");
          current.setStart(dateFormat.parse(label.substring(12, dash - 1)));
          current.setStop(dateFormat.parse(label.substring(dash + 2, label.length() - 2)));
          current.setFormText(label);
          current.setFormId(formId);

          sch.addSemester(current);
        }

        if (option.hasAttr("selected") && option.attr("selected").equals("selected")) {
          long updated = current.setLoaded().update();
          selG = current;

          if (BuildConfig.DEBUG) {
            Log.d("GshisHTMLParser", TS.get() + " Semester.update() = " + updated);
          }
        }
      }
    }

    if (!anyOption) {
      throw new ParseException("Semesters not found", 0);
    }

    return selG;
  }
コード例 #7
0
  /**
   * Reads the learn-year drop-down (id {@code ctl00_learnYear_drdLearnYears}) and returns the
   * selected schedule.
   *
   * <p>Each {@code option} is looked up on the pupil by its {@code value} attribute. Unknown
   * schedules are created from the option's value and text; the start date is the part of the
   * text before the {@code "-"} combined with {@code " 01.09"}, the stop date is the part after
   * it combined with {@code " 31.05"} (pattern {@code "yyyy dd.MM"}, Europe/Moscow). The new
   * schedule is then added to the pupil.
   *
   * @param doc parsed page to inspect
   * @param selPupil pupil the schedules belong to
   * @return the schedule whose option has {@code selected="selected"}, or {@code null} if
   *     options exist but none is marked selected
   * @throws ParseException if no option was found or a date cannot be parsed
   */
  public static Schedule getSelectedSchedule(Document doc, Pupil selPupil) throws ParseException {

    Schedule selectedS = null;
    boolean anyOption = false;

    for (Element selector :
        doc.getElementsByAttributeValue("id", "ctl00_learnYear_drdLearnYears")) {
      for (Element option : selector.getAllElements()) {
        if (!option.tagName().equals("option")) {
          continue;
        }

        anyOption = true;
        String formId = option.attr("value");

        Schedule current = selPupil.getScheduleByFormId(formId);
        if (current == null) {
          final SimpleDateFormat yearDayFormat = new SimpleDateFormat("yyyy dd.MM", Locale.ENGLISH);
          yearDayFormat.setTimeZone(TimeZone.getTimeZone("Europe/Moscow"));

          String label = option.text();
          current = new Schedule(formId, label);

          // Presumably the label looks like "yyyy - yyyy"; verify against the live page.
          int dash = label.indexOf("-");
          Date begin = yearDayFormat.parse(label.substring(0, dash - 1) + " 01.09");
          Date end = yearDayFormat.parse(label.substring(dash + 2) + " 31.05");

          current.setStart(begin);
          current.setStop(end);

          selPupil.addSchedule(current);
        }

        if (option.hasAttr("selected") && option.attr("selected").equals("selected")) {
          selectedS = current;
        }
      }
    }

    if (!anyOption) {
      throw new ParseException("Years not found", 0);
    }

    return selectedS;
  }
コード例 #8
0
  /**
   * Extracts the article text from the first {@code div} carrying the configured class name.
   *
   * <p>Every element inside that div whose class attribute equals {@code similar_news_box}
   * (ignoring case) is removed, the div's full text is printed to standard output, and the
   * div's own text is returned.
   *
   * @param doc parsed page to inspect
   * @return the own text of the first matching div
   * @throws BusinessException if no matching div exists
   */
  @Override
  public String getArticleText(Document doc) throws BusinessException {
    Elements candidates = doc.select("div." + className);
    if (candidates.size() <= 0) {
      throw new BusinessException();
    }

    Element article = candidates.get(0);
    for (Element inner : article.getAllElements()) {
      boolean isSimilarNews = "similar_news_box".equalsIgnoreCase(inner.className());
      if (isSimilarNews) {
        inner.remove();
      }
    }

    // NOTE(review): text() is printed but ownText() is returned; confirm which one callers
    // are meant to receive.
    System.out.println(article.text());
    return article.ownText();
  }