public static void main(String[] args) throws IOException {

    // Download the HTML document.
    Document doc = Jsoup.connect(url).get();
    // Select the TABLE tags with width=720, excluding those with border=0.
    Elements tables = doc.select("table[width=720]").not("[border=0]");

    // The original author noted that the target page yields 6 tables.
    System.out.println("TABLES: " + tables.size());

    // Only tables whose first and last row (TR) carry the same text are of interest.
    for (Element table : tables) {
      System.out.println("--------------------------------");
      Elements rows = table.select("tr");
      System.out.println("TR: " + rows.size());
      if (rows.isEmpty()) {
        // Nothing to compare; previously this raised IndexOutOfBoundsException.
        System.out.println("NOT OK");
        continue;
      }

      String sFirst = rows.get(0).text();
      System.out.println("text=" + sFirst);
      String sLast = rows.get(rows.size() - 1).text();
      System.out.println("text=" + sLast);

      boolean sameFrameRows = sFirst.equals(sLast);
      System.out.println(sameFrameRows);

      // The change date is the text between "last updated" and the next " - ".
      String[] aroundMarker = sFirst.split("last updated", 2);
      if (sameFrameRows && aroundMarker.length == 2) {
        // Limit 2 keeps a leading empty token, so index 0 always exists.
        System.out.println("data zmiany: " + aroundMarker[1].split(" - ", 2)[0].trim());
        // Count the rows between the two frame rows without removing anything from the
        // selection (a single-row table used to fail on remove(-1)).
        System.out.println("ROWS: " + Math.max(0, rows.size() - 2));
      } else {
        System.out.println("NOT OK");
      }
    }
  }
  public static void main(String args[]) throws IOException {
    // Step 1: extract all labels and instances.
    Document doc = Jsoup.connect("http://127.0.0.1/master%20project/websites/home.php").get();
    Elements labelElements = doc.getElementsByAttributeValue("id", "label");
    Elements instanceElements = doc.getElementsByAttributeValue("id", "instance");

    // Step 2: pair C(l,i) using the single link clustering algorithm.
    // NOTE: "date" labels are a special case and get today's date instead of page content.
    HashMap<String, String[]> singleLinkClusterMap = new HashMap<String, String[]>();

    for (int i = 0; i < labelElements.size(); i++) {
      // Labels and their instances are matched through a shared numeric "tag" attribute.
      String tagSelector = "[tag=" + i + "]";
      String key = labelElements.select(tagSelector).text();

      String[] values;
      if (key.equalsIgnoreCase("date")) {
        values = new String[] {new SimpleDateFormat("yyyy-MM-dd").format(new Date())};
      } else {
        Elements instancesForKey = instanceElements.select(tagSelector);
        values = new String[instancesForKey.size()];
        // Read by index. The previous loop removed elements while testing against the
        // shrinking size, so only the first half of the values was ever filled in.
        for (int j = 0; j < values.length; j++) {
          values[j] = instancesForKey.get(j).text();
        }
      }
      singleLinkClusterMap.put(key, values);
    }

    System.out.println("label:" + singleLinkClusterMap);

    // Step 3: To create base Ontology

    // TEST: to fire the source page with a request query then extract the data from resulting
    // page...

    // Step 4: To create one(or more) slave to which the base ontology and interface website address
    // is sent to.
    // This(These) slaves will then repeat steps 1 and 2 then create their own Ontology O'

    // Step 5: The new ontology O' will then be sent back to the Master to merge with original O.
  }
  /**
   * Converts the first table of an HTML fragment into a JSON array with one object per usable row.
   *
   * <p>For each row the first {@code td} is ignored. From the second cell the second
   * space-separated token is stored under {@code labels[0]}; from the third cell the text between
   * parentheses is stored under {@code labels[1]}. Rows that do not have this shape (header rows
   * without {@code td}, too few cells, missing tokens) are skipped instead of aborting the parse.
   *
   * @param html markup containing the table
   * @param labels JSON keys; at least two entries are read
   * @return the collected objects, empty when the markup contains no table or no usable row
   */
  public JSONArray toRainChanceJSON(String html, String[] labels) {
    JSONArray times = new JSONArray();

    Element table = Jsoup.parse(html).select("table").first();
    if (table == null) {
      return times;
    }

    try {
      for (Element row : table.select("tr")) {
        Elements cells = row.select("td");
        // One leading cell to ignore plus the two cells that are read.
        if (cells.size() < 3) {
          continue;
        }

        String[] firstTokens = cells.get(1).text().split("[ ]");

        String secondText = cells.get(2).text();
        String[] secondTokens = secondText.split("[()]");
        Log.d("jsoup", "str: " + secondText);

        if (firstTokens.length < 2 || secondTokens.length < 2) {
          continue;
        }

        JSONObject details = new JSONObject();
        details.put(labels[0], firstTokens[1]);
        details.put(labels[1], secondTokens[1]);
        times.put(details);
      }
    } catch (JSONException e) {
      Log.e("jsoup", "could not build JSON for table row", e);
    }

    return times;
  }
  /**
   * Rewrites the {@code cfu} rule of a simservs document: replaces its conditions element and sets
   * the forward-to target.
   *
   * @param isActivate when true an empty conditions element is written, otherwise one containing
   *     {@code ss:rule-deactivated}
   * @param target markup placed inside {@code ss:forward-to > ss:target}
   * @param ectXml the document to modify
   * @return an XML declaration followed by the serialised {@code ss:simservs} element
   */
  public static String updateAFGXml(boolean isActivate, String target, String ectXml) {
    final String conditions;
    if (isActivate) {
      conditions = "<cp:conditions/>";
    } else {
      conditions = "<cp:conditions><ss:rule-deactivated/></cp:conditions>";
    }

    // NOTE(review): the second argument of Jsoup.parse(String, String) is the base URI, not a
    // charset - confirm whether "UTF-8" was meant to be something else.
    Document parsed = Jsoup.parse(ectXml, "UTF-8");
    Elements cfuRule = parsed.select("cp|rule[id=cfu] ");

    // The old conditions element is removed and a new one prepended; it is not edited in place.
    cfuRule.select("cp|conditions").remove();
    cfuRule.prepend(conditions);

    cfuRule.select("ss|forward-to>ss|target").html(target);

    StringBuilder xml = new StringBuilder("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");
    xml.append(parsed.getElementsByTag("ss:simservs").outerHtml());

    // Restore the mixed-case element name (presumably lower-cased by jsoup's HTML parsing -
    // verify), then strip the whitespace surrounding text content.
    return xml.toString()
        .replaceAll("noreplytimer", "NoReplyTimer")
        .replaceAll(">\\s+(.+)\\s+<", ">$1<");
  }
  // Example #5
  // 0
  /**
   * Parses a listing page into a feed of posts plus the URL of the next page.
   *
   * <p>Rows are consumed in groups of three: the first row of a group carries title and link, the
   * second carries points, author and the comments link, the third is ignored. Parsing stops at the
   * first group whose title link is missing.
   *
   * @param doc the page to parse, may be null
   * @return an empty feed for a null document, otherwise the parsed posts and next-page URL
   */
  @Override
  public HNFeed parseDocument(Document doc) throws Exception {
    if (doc == null) return new HNFeed();

    ArrayList<HNPost> posts = new ArrayList<HNPost>();

    // Clumsy, but hopefully stable query - the first element retrieved belongs to the top table
    // and has to be skipped. It is skipped by copying the remainder: an empty result no longer
    // throws, and nothing is removed from the selection (recent jsoup releases also detach
    // elements removed from an Elements list from the document).
    Elements matchedRows = doc.select("table tr table tr");
    Elements tableRows =
        matchedRows.isEmpty()
            ? matchedRows
            : new Elements(matchedRows.subList(1, matchedRows.size()));

    Elements nextPageURLElements = tableRows.select("a:matches(More)");
    String nextPageURL = null;
    if (nextPageURLElements.size() > 0)
      nextPageURL = resolveRelativeHNURL(nextPageURLElements.attr("href"));

    String url = null;
    String title = null;
    String urlDomain = null;

    for (int row = 0; row < tableRows.size(); row++) {
      Element rowElement = tableRows.get(row);
      int rowInPost = row % 3;

      if (rowInPost == 0) {
        Element titleLink = rowElement.select("tr > td:eq(2) > a").first();
        if (titleLink == null) {
          // No title link: the post rows are over.
          break;
        }
        title = titleLink.text();
        url = resolveRelativeHNURL(titleLink.attr("href"));
        urlDomain = getDomainName(url);
      } else if (rowInPost == 1) {
        int points =
            getIntValueFollowedBySuffix(rowElement.select("tr > td:eq(1) > span").text(), " p");
        String author = rowElement.select("tr > td:eq(1) > a[href*=user]").text();

        // Determined per post so a row without an item link does not inherit the previous post's
        // id.
        int commentsCount = BaseHTMLParser.UNDEFINED;
        String postID = null;
        Element itemLink = rowElement.select("tr > td:eq(1) > a[href*=item]").first();
        if (itemLink != null) {
          commentsCount = getIntValueFollowedBySuffix(itemLink.text(), " c");
          if (commentsCount == BaseHTMLParser.UNDEFINED && itemLink.text().contains("discuss"))
            commentsCount = 0;
          postID = getStringValuePrefixedByPrefix(itemLink.attr("href"), "id=");
        }

        posts.add(new HNPost(url, title, urlDomain, author, postID, commentsCount, points));
      }
    }

    return new HNFeed(posts, nextPageURL);
  }
  /**
   * Downloads the what-if archive for offline use while reporting progress through notification
   * id 1: first the overview (titles and thumbnails), then every article page with its
   * illustrations. Downloads are skipped in debug builds. Finally marks the download as done and
   * posts a notification that reopens the comic activity.
   *
   * @param intent the intent that started the service (not read)
   */
  @Override
  protected void onHandleIntent(Intent intent) {
    NotificationCompat.Builder mBuilder =
        new NotificationCompat.Builder(this)
            .setSmallIcon(R.drawable.ic_notification)
            .setProgress(100, 0, false)
            .setOngoing(true)
            .setContentTitle(getResources().getString(R.string.loading_offline_whatif))
            .setAutoCancel(true);

    NotificationManager mNotificationManager =
        (NotificationManager) getSystemService(NOTIFICATION_SERVICE);
    mNotificationManager.notify(1, mBuilder.build());

    PrefHelper prefHelper = new PrefHelper(getApplicationContext());
    File sdCard = prefHelper.getOfflinePath();
    File dir = new File(sdCard.getAbsolutePath() + OFFLINE_WHATIF_OVERVIEW_PATH);
    OkHttpClient client = new OkHttpClient();
    Document doc;
    if (!dir.exists()) dir.mkdirs();
    // download overview
    if (!BuildConfig.DEBUG) {
      try {
        doc =
            Jsoup.connect("https://what-if.xkcd.com/archive/")
                .userAgent(
                    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.19 Safari/537.36")
                .get();
        Elements titles = doc.select("h1");
        prefHelper.setNewestWhatif(titles.size());

        // Titles are stored as one "&&"-separated string. A page without headings used to
        // crash the service with a NullPointerException; now it leaves the stored titles alone.
        if (!titles.isEmpty()) {
          StringBuilder sb = new StringBuilder();
          for (int t = 0; t < titles.size(); t++) {
            if (t > 0) {
              sb.append("&&");
            }
            sb.append(titles.get(t).text());
          }
          prefHelper.setWhatIfTitles(sb.toString());
        }

        Elements img = doc.select("img.archive-image");
        int count = 1;
        for (Element image : img) {
          String url = image.absUrl("src");
          try {
            downloadToFile(client, url, new File(dir, String.valueOf(count) + ".png"));
          } catch (Exception e) {
            // Best effort: a failed thumbnail must not stop the remaining downloads.
            e.printStackTrace();
          }
          int p = (int) (count / ((float) img.size()) * 100);
          mBuilder.setProgress(100, p, false);
          mNotificationManager.notify(1, mBuilder.build());
          count++;
        }
        if (prefHelper.getNewestWhatIf() == 0) prefHelper.setNewestWhatif(count - 1);
      } catch (IOException e) {
        e.printStackTrace();
      }

      // download html
      int size = prefHelper.getNewestWhatIf();
      for (int i = 1; i <= size; i++) {
        try {
          doc = Jsoup.connect("https://what-if.xkcd.com/" + String.valueOf(i)).get();
          File articleDir =
              new File(sdCard.getAbsolutePath() + OFFLINE_WHATIF_PATH + String.valueOf(i));
          articleDir.mkdirs();
          File file = new File(articleDir, String.valueOf(i) + ".html");
          BufferedWriter writer = new BufferedWriter(new FileWriter(file));
          try {
            writer.write(doc.outerHtml());
          } finally {
            writer.close();
          }
          // download images
          int count = 1;
          for (Element e : doc.select(".illustration")) {
            try {
              String url = "http://what-if.xkcd.com" + e.attr("src");
              if (!articleDir.exists()) articleDir.mkdirs();
              downloadToFile(client, url, new File(articleDir, String.valueOf(count) + ".png"));
              // Only successful downloads consume a file number.
              count++;
            } catch (Exception e2) {
              // String.valueOf guards against a null exception message.
              Log.e("article" + i, String.valueOf(e2.getMessage()), e2);
            }
          }
          int p = (int) (i / ((float) size) * 100);
          mBuilder.setProgress(100, p, false);
          mBuilder.setContentText(i + "/" + size);
          mNotificationManager.notify(1, mBuilder.build());
        } catch (Exception e) {
          Log.e("article" + i, String.valueOf(e.getMessage()), e);
        }
      }
    }
    prefHelper.setSunbeamLoaded();

    Intent restart = new Intent("de.tap.easy_xkcd.ACTION_COMIC");
    restart.putExtra("number", prefHelper.getLastComic());
    PendingIntent pendingIntent =
        PendingIntent.getActivity(this, 1, restart, PendingIntent.FLAG_UPDATE_CURRENT);
    mBuilder
        .setContentIntent(pendingIntent)
        .setContentText(getResources().getString(R.string.not_restart));
    mNotificationManager.notify(1, mBuilder.build());
  }

  /** Fetches {@code url} into {@code target}, always closing the file sink and response body. */
  private static void downloadToFile(OkHttpClient client, String url, File target)
      throws IOException {
    Request request = new Request.Builder().url(url).build();
    Response response = client.newCall(request).execute();
    try {
      BufferedSink sink = Okio.buffer(Okio.sink(target));
      try {
        sink.writeAll(response.body().source());
      } finally {
        sink.close();
      }
    } finally {
      response.body().close();
    }
  }
 /**
  * Builds a group with its timetable from a schedule table.
  *
  * <p>The first two rows of the table are skipped. A row containing an image starts a new parity
  * block (image source {@code data/1.gif} gives parity 2, anything else parity 1) and resets the
  * day counter to 1. In such a row the first two cells are skipped, in every other row the first
  * one. Each remaining non-blank cell yields one lesson per teacher/auditory pair found in its
  * text.
  *
  * @param schedule element containing the schedule rows
  * @return a group holding the parsed lessons, or {@code null} when no lesson was found
  */
 public static Group getSchedule(Element schedule) {
   Elements trs = schedule.getElementsByTag(Constants.TR);
   int numberOfDay = 0;
   List<Lesson> lessonList = new ArrayList<Lesson>();
   Long parity = null;
   // Skip the two leading rows by index: tables with fewer rows no longer throw, and the passed
   // element is not modified (recent jsoup releases detach elements removed from an Elements list).
   for (int rowIndex = 2; rowIndex < trs.size(); rowIndex++) {
     Element tr = trs.get(rowIndex);
     Elements images = tr.getElementsByTag("img");
     boolean startsParityBlock = images.size() != 0;
     if (startsParityBlock) {
       parity = Long.valueOf(images.first().attr("src").equals("data/1.gif") ? 2L : 1L);
       numberOfDay = 1;
     }
     Elements tds = tr.getElementsByTag(Constants.TD);
     // Rows that start a parity block carry one extra leading cell.
     int firstLessonCell = startsParityBlock ? 2 : 1;
     int i = 1;
     for (int cellIndex = firstLessonCell; cellIndex < tds.size(); cellIndex++, i++) {
       Element td = tds.get(cellIndex);
       // The i-th lesson cell of a row uses the i-th entry of the ring table, "start-end".
       String time = getRings().get(i);
       String[] bounds = time.split("-");
       String timeStart = bounds[0];
       String timeEnd = bounds[1];
       if (td.text().replaceAll("\\u00A0", "").equals("")) {
         // Cell holds nothing but non-breaking spaces: no lesson in this slot.
         continue;
       }
       String subElement = td.text();
       int type = CommonUtils.getTypeOfLesson(getType(subElement));
       String sub = subElement.replace(getType(subElement), "");

       // Expand abbreviations and turn every teacher title into a ';' separator.
       String subSplit =
           sub.replaceAll("в проф\\.деят-сти", "в профессиональной деятельности")
               .replaceAll("Дальневост.рег", "Дальневосточного региона")
               .replace("а. сз", "а.сз")
               .replaceAll("асс\\.", ";")
               .replaceAll("ст\\.пр\\.", ";");
       if (!sub.matches("(.*)ест\\.(.*)")) {
         // A bare "ст." is only treated as a title when the text contains no "ест." sequence,
         // which would otherwise be split in the middle of a word.
         subSplit = subSplit.replaceAll("ст\\.", ";");
       }
       subSplit = subSplit.replaceAll("доц\\.", ";").replaceAll("проф\\.", ";");

       String[] subList = subSplit.split(";");
       String subject;
       String sbj;
       // Entries have the form "teacher;auditory"; the literal "null" marks a missing teacher.
       List<String> teachersAuditories = new ArrayList<String>();
       if (subList.length == 1) {
         // No teacher title found: the last word is the auditory, the rest is the subject.
         int cut = subList[0].lastIndexOf(" ");
         sbj = subList[0].substring(0, cut).trim().toLowerCase();
         subject = sbj.substring(0, 1).toUpperCase().concat(sbj.substring(1));
         teachersAuditories.add("null;".concat(subList[0].substring(cut + 1)));
       } else {
         sbj = subList[0].trim().toLowerCase();
         subject = sbj.substring(0, 1).toUpperCase().concat(sbj.substring(1));
         if (subList.length == 2) {
           String only = subList[1];
           int cut = only.lastIndexOf(" ");
           teachersAuditories.add(
               only.substring(0, cut).trim().concat(";").concat(only.substring(cut + 1)));
         } else if (subList.length == 3) {
           String first = subList[1].trim();
           int firstCut = first.lastIndexOf(" ");
           teachersAuditories.add(
               first.substring(0, firstCut).trim().concat(";").concat(first.substring(firstCut + 1)));

           String second = subList[2].trim();
           if (subList[2].contains("-")) {
             // The second pair may be separated by a dash instead of a space.
             int dash = second.lastIndexOf("-");
             teachersAuditories.add(
                 second
                     .substring(0, dash)
                     .trim()
                     .concat(";")
                     .concat(second.substring(dash + 1).trim()));
           } else {
             int secondCut = second.lastIndexOf(" ");
             teachersAuditories.add(
                 second
                     .substring(0, secondCut)
                     .trim()
                     .concat(";")
                     .concat(second.substring(secondCut + 1)));
           }
         }
       }
       for (String teacherAuditory : teachersAuditories) {
         String[] pair = teacherAuditory.split(";");
         Lesson lesson =
             new Lesson(
                 subject,
                 numberOfDay + "",
                 null,
                 timeStart,
                 timeEnd,
                 type,
                 pair[1],
                 null,
                 pair[0].equals("null") ? null : pair[0],
                 parity);
         lessonList.add(lesson);
         log.debug(lesson);
       }
     }
     numberOfDay++;
   }
   if (lessonList.isEmpty()) {
     return null;
   }
   Group group = new Group();
   group.setListLessons(lessonList);
   return group;
 }
 /**
  * Extends {@code sub} with the children of {@code sup}.
  *
  * <p>Works on a cloned copy of {@code sup}'s children. Children of {@code sub} remove their
  * overridden counterpart from that copy; children that merely correspond to a counterpart (same
  * identification attribute) are replaced by the recursive extension of the two. Whatever is left
  * of the copy is then prepended to {@code sub} in its original order.
  *
  * @param sup the super element; not modified
  * @param sub the extending element; modified in place
  * @return {@code sub}
  */
 private Element extend(Element sup, Element sub) {
   Elements subElements = sub.children();
   Elements supElements = sup.children().clone();

   for (Element e : subElements) {
     if (e.hasAttr(Language.OVERRIDE_ATTRIBUTE)) {
       // Explicit override: drop the first sup element with the same identification.
       // Fails silently if no element is found to override.
       String id = e.attr(Language.IDENTIFICATION_ATTRIBUTE);
       for (Element el : supElements) {
         if (el.attr(Language.IDENTIFICATION_ATTRIBUTE).equals(id)) {
           supElements.remove(el);
           break;
         }
       }
     } else if (Language.isOverridden(e)) {
       // Some elements are automatically overridden if they exist: drop the first sup element
       // with the same tag name. Fails silently if there is none.
       for (Element el : supElements) {
         if (el.tagName().equals(e.tagName())) {
           supElements.remove(el);
           break;
         }
       }
     } else if (e.tagName().equals("meta")) {
       // Meta tags override the sup meta tag with the same name or, failing that, the same
       // http-equiv.
       String keyAttribute = null;
       if (e.hasAttr("name")) {
         keyAttribute = "name";
       } else if (e.hasAttr("http-equiv")) {
         keyAttribute = "http-equiv";
       }
       if (keyAttribute != null) {
         Elements metaThatMatch =
             supElements.select("meta[" + keyAttribute + "=\"" + e.attr(keyAttribute) + "\"]");
         if (metaThatMatch.size() == 1) {
           // select() also searches descendants, so the match need not be a top-level entry;
           // indexOf then yields -1, which used to make remove() throw.
           int index = supElements.indexOf(metaThatMatch.first());
           if (index >= 0) {
             supElements.remove(index);
           }
         }
       }
     } else {
       // Not overridden but corresponding to a sup element: recursively extend it.
       for (Element el : supElements) {
         if (el.hasAttr(Language.IDENTIFICATION_ATTRIBUTE)
             && el.attr(Language.IDENTIFICATION_ATTRIBUTE)
                 .equals(e.attr(Language.IDENTIFICATION_ATTRIBUTE))) {
           Element temp = extend(el.clone(), e.clone()).clone();
           e.replaceWith(temp);
           supElements.remove(el);
           break;
         }
       }
     }
   }

   // Add the remaining sup elements to the beginning of sub. Prepending in reverse keeps their
   // original order. This is where the real extension happens.
   Collections.reverse(supElements);
   for (Element e : supElements) {
     sub.prependChild(e.clone());
   }
   return sub;
 }
  /**
   * Parses a comments page into a list of comments plus the URL of the next page.
   *
   * <p>Every second row (starting with the first after the header row) is read as a comment;
   * parsing stops at the first such row without a text span.
   *
   * @param doc the page to parse, may be null
   * @return the parsed comments; empty for a null document or a page without matching rows
   */
  @Override
  public SNComments parseDocument(Document doc) throws Exception {
    SNComments comments = new SNComments();
    if (doc == null) {
      return comments;
    }
    Elements matchedRows = doc.body().select("table tr table tr");
    if (matchedRows.isEmpty()) {
      return comments;
    }
    // Skip the first matched row by copying the remainder, so nothing is removed from the
    // selection (recent jsoup releases also detach removed elements from the document).
    Elements tableRows = new Elements(matchedRows.subList(1, matchedRows.size()));

    // Get the next-page link.
    Elements moreURLElements = tableRows.select("a:matches(More)");
    String moreURL = null;
    if (moreURLElements.size() > 0) {
      moreURL = resolveRelativeSNURL(moreURLElements.attr("href"));
    }
    comments.setMoreURL(moreURL);

    for (int row = 0; row < tableRows.size(); row += 2) {
      Element rowElement = tableRows.get(row);
      Element textElement = rowElement.select("tr > td:eq(1) > span").first();
      if (textElement == null) {
        break;
      }

      // All fields are local to one comment so that a missing link is reported as null instead
      // of silently reusing the value of the previous comment.
      String linkURL = null;
      String parentURL = null;
      String discussURL = null;
      String created = null;
      String artistTitle = null; // article title
      String voteURL = null;
      String text = textElement.text();
      SNUser user = new SNUser();

      Element spanElement = rowElement.select("tr > td:eq(1) > div > span").first();
      if (spanElement != null) {
        created = getCreateAt(spanElement.text());
        Elements aElements = spanElement.select("span > a");
        if (aElements.size() >= 4) {
          // Link order: author, comment link, parent link, ..., article (last).
          user.setId(aElements.first().text());
          linkURL = resolveRelativeSNURL(aElements.get(1).attr("href"));
          parentURL = resolveRelativeSNURL(aElements.get(2).attr("href"));
          Element artistAElement = aElements.last();
          discussURL = resolveRelativeSNURL(artistAElement.attr("href"));
          artistTitle = artistAElement.text();
          // TODO edit delete (rows with six links)
        }
      }

      Element voteAElement = rowElement.select("tr > td:eq(0) a").first();
      if (voteAElement != null) {
        // Comments written by the logged-in user have no vote URL.
        voteURL = resolveRelativeSNURL(voteAElement.attr("href"));
      }
      comments.addComment(
          new SNComment(
              linkURL, parentURL, discussURL, text, created, user, artistTitle, voteURL, null));
    }
    return comments;
  }
  /**
   * Parses table rows into the programme entries of one column.
   *
   * <p>The table is treated as having 8 columns. A cell's {@code rowspan} is tracked per column so
   * that rows covered by a cell from an earlier row get an empty placeholder instead of consuming
   * one of their own cells. A row that cannot be parsed (too few cells, missing or non-numeric
   * {@code rowspan}) is reported on standard output and left out.
   *
   * @param rows source rows
   * @param column index (0-7) of the column whose data is wanted
   * @return one {@code {dt text, dd text}} pair per row in which both texts are non-empty
   */
  private static String[][] parseRows(Elements rows, int column) {

    List<List<String[]>> programsList = new ArrayList<List<String[]>>();
    // Remaining rowspan per column, carried over from row to row.
    int[] rowspan_rest = new int[8];

    for (int i = 0; i < rows.size(); i++) {
      Element row = rows.get(i);

      try {
        Elements cells = row.children();
        List<String[]> data = new ArrayList<String[]>();

        if (null != cells && cells.size() > 0) {
          // Cells are consumed through a cursor rather than removed from the list, because
          // recent jsoup releases detach removed elements from the document, which would
          // corrupt the rows for a later call with another column.
          int nextCell = 0;
          for (int m = 0; m < 8; m++) {
            if (rowspan_rest[m] > 1) {
              // Column is still covered by a cell from an earlier row.
              data.add(new String[] {"", ""});
              rowspan_rest[m]--;
            } else {
              Element cell = cells.get(nextCell);
              int span = Integer.parseInt(cell.attr("rowspan"));
              String[] program = new String[2];
              program[0] = DBclass.xmlFilte(cell.select("dt").text());
              program[1] = DBclass.xmlFilte(cell.select("dd").text());
              data.add(program);

              nextCell++;
              rowspan_rest[m] = span;
            }
          }
        } else {
          // Row without cells: every column gets a placeholder.
          for (int r = 0; r < 8; r++) {
            data.add(new String[] {"", ""});
            rowspan_rest[r]--;
          }
        }

        programsList.add(data);

      } catch (Exception e) {
        e.printStackTrace(System.out);
      }
    }

    // Keep only the entries of the requested column that have both texts.
    List<String[]> selected = new ArrayList<String[]>();
    for (List<String[]> data : programsList) {
      String[] program = data.get(column);
      if (!StringUtils.isNullOrEmpty(program[0]) && !StringUtils.isNullOrEmpty(program[1])) {
        selected.add(program);
      }
    }

    return selected.toArray(new String[selected.size()][]);
  }