public static void main(String[] args) throws IOException { // Pobranie pliku HTML Document doc = Jsoup.connect(url).get(); // Wybieramy TAGI: TABLE gdzie width=720 i nie te dla których border=0 Elements media = doc.select("table[width=720]").not("[border=0]"); System.out.println("TABLES: " + media.size()); // jest 6 tabel // InteresujÄ… nas tylko te tabelki dla których: // pierwszy i ostatni rzÄ…d (TR) jest taki sam... for (Element src : media) { // przelatuje po tabelach (szeœciu) System.out.println("--------------------------------"); Elements media2 = src.select("tr"); System.out.println("TR: " + media2.size()); Element first = media2.get(0); // System.out.println(media2.get(0).select("td").size()); String sFirst = first.text(); // System.out.println(first); System.out.println("text=" + sFirst); Element last = media2.get(media2.size() - 1); // System.out.println(media2.get(media2.size()-1).select("td").size()); String sLast = last.text(); // System.out.println(last); System.out.println("text=" + sLast); System.out.println(sFirst.equals(sLast)); if (sFirst.equals(sLast)) { System.out.println( "data zmiany: " + sFirst.split("last updated", 2)[1].split(" - ")[0].trim()); // System.out.println(media2.size()); media2.remove(0); media2.remove(media2.size() - 1); System.out.println("ROWS: " + media2.size()); } else { System.out.println("NOT OK"); } } }
public static void main(String args[]) throws IOException { // Element.ownText() // Step 1: To extract all labels and instances... Document doc = Jsoup.connect("http://127.0.0.1/master%20project/websites/home.php").get(); Elements labelElements = doc.getElementsByAttributeValue("id", "label"); Elements instanceElements = doc.getElementsByAttributeValue("id", "instance"); // Step 2: To pair C(l,i) using single link clustering algorithm... NOTE: special Date case... HashMap<String, String[]> singleLinkClusterMap = new HashMap<String, String[]>(); // singleLinkClusterMap.put(key, value) for (int i = 0; i < labelElements.size(); i++) { // Keys... String key = labelElements.select("[tag=" + i + "]").text(); if (!key.toLowerCase().equals("date")) { // Values... Elements instanceElementsForThisKey = instanceElements.select("[tag=" + i + "]"); String[] values = new String[instanceElementsForThisKey.size()]; for (int j = 0; j < instanceElementsForThisKey.size(); j++) { values[j] = instanceElementsForThisKey.remove(0).text(); } singleLinkClusterMap.put(key, values); } else { Date date = new Date(); String modifiedDate = new SimpleDateFormat("yyyy-MM-dd").format(date); String[] values = {modifiedDate.toString()}; singleLinkClusterMap.put(key, values); } } System.out.println("label:" + singleLinkClusterMap); // Step 3: To create base Ontology // TEST: to fire the source page with a request query then extract the data from resulting // page... // Step 4: To create one(or more) slave to which the base ontology and interface website address // is sent to. // This(These) slaves will then repeat steps 1 and 2 then create their own Ontology O' // Step 5: The new ontology O' will then be sent back to the Master to merge with original O. }
public JSONArray toRainChanceJSON(String html, String[] labels) { Document doc = Jsoup.parse(html); JSONArray times = new JSONArray(); try { Element table = doc.select("table").first(); Elements rows = table.select("tr"); for (Element row : rows) { Elements data = row.select("td"); data.remove(0); if (!data.isEmpty()) { JSONObject details = new JSONObject(); String str = data.get(0).text(); String delims = "[ ]"; String[] tokens = str.split(delims); details.put(labels[0], tokens[1]); str = data.get(1).text(); delims = "[()]"; tokens = str.split(delims); Log.d("jsoup", "str: " + str); details.put(labels[1], tokens[1]); // for(Element dataItem : data){ // details.put(labels[data.indexOf(dataItem)], dataItem.text()); // } times.put(details); } } } catch (JSONException e) { e.printStackTrace(); } // Log.d("jsoup", times.toString()); // return s; return times; }
public static String updateAFGXml(boolean isActivate, String target, String ectXml) { String conditionStr = isActivate ? "<cp:conditions/>" : "<cp:conditions><ss:rule-deactivated/></cp:conditions>"; Document doc = Jsoup.parse(ectXml, "UTF-8"); Elements ruleAudio = doc.select("cp|rule[id=cfu] "); Elements ruleAudioCondition = ruleAudio.select("cp|conditions"); ruleAudioCondition.remove(); // we cant change it to "<cp:conditions/> directly ruleAudio.prepend(conditionStr); Elements ruleAudioForwardTarget = ruleAudio.select("ss|forward-to>ss|target"); ruleAudioForwardTarget.html(target); String r = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"; r += doc.getElementsByTag("ss:simservs").outerHtml(); // modify for jsoup problem r = r.replaceAll("noreplytimer", "NoReplyTimer"); // r= r.replaceAll("\n", ""); r = r.replaceAll(">\\s+(.+)\\s+<", ">$1<"); return r; }
@Override public HNFeed parseDocument(Document doc) throws Exception { if (doc == null) return new HNFeed(); ArrayList<HNPost> posts = new ArrayList<HNPost>(); // clumsy, but hopefully stable query - first element retrieved is the // top table, we have to skip that: Elements tableRows = doc.select("table tr table tr"); tableRows.remove(0); Elements nextPageURLElements = tableRows.select("a:matches(More)"); String nextPageURL = null; if (nextPageURLElements.size() > 0) nextPageURL = resolveRelativeHNURL(nextPageURLElements.attr("href")); String url = null; String title = null; String author = null; int commentsCount = 0; int points = 0; String urlDomain = null; String postID = null; boolean endParsing = false; for (int row = 0; row < tableRows.size(); row++) { int rowInPost = row % 3; Element rowElement = tableRows.get(row); switch (rowInPost) { case 0: Element e1 = rowElement.select("tr > td:eq(2) > a").first(); if (e1 == null) { endParsing = true; break; } title = e1.text(); url = resolveRelativeHNURL(e1.attr("href")); urlDomain = getDomainName(url); break; case 1: points = getIntValueFollowedBySuffix(rowElement.select("tr > td:eq(1) > span").text(), " p"); author = rowElement.select("tr > td:eq(1) > a[href*=user]").text(); Element e2 = rowElement.select("tr > td:eq(1) > a[href*=item]").first(); if (e2 != null) { commentsCount = getIntValueFollowedBySuffix(e2.text(), " c"); if (commentsCount == BaseHTMLParser.UNDEFINED && e2.text().contains("discuss")) commentsCount = 0; postID = getStringValuePrefixedByPrefix(e2.attr("href"), "id="); } else commentsCount = BaseHTMLParser.UNDEFINED; posts.add(new HNPost(url, title, urlDomain, author, postID, commentsCount, points)); break; default: break; } if (endParsing) break; } return new HNFeed(posts, nextPageURL); }
@Override protected void onHandleIntent(Intent intent) { NotificationCompat.Builder mBuilder = new NotificationCompat.Builder(this) .setSmallIcon(R.drawable.ic_notification) .setProgress(100, 0, false) .setOngoing(true) .setContentTitle(getResources().getString(R.string.loading_offline_whatif)) .setAutoCancel(true); NotificationManager mNotificationManager = (NotificationManager) getSystemService(NOTIFICATION_SERVICE); mNotificationManager.notify(1, mBuilder.build()); PrefHelper prefHelper = new PrefHelper(getApplicationContext()); File sdCard = prefHelper.getOfflinePath(); File dir = new File(sdCard.getAbsolutePath() + OFFLINE_WHATIF_OVERVIEW_PATH); OkHttpClient client = new OkHttpClient(); Document doc; if (!dir.exists()) dir.mkdirs(); // download overview if (!BuildConfig.DEBUG) { try { doc = Jsoup.connect("https://what-if.xkcd.com/archive/") .userAgent( "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.19 Safari/537.36") .get(); StringBuilder sb = new StringBuilder(); Elements titles = doc.select("h1"); prefHelper.setNewestWhatif(titles.size()); sb.append(titles.first().text()); titles.remove(0); for (Element title : titles) { sb.append("&&"); sb.append(title.text()); } prefHelper.setWhatIfTitles(sb.toString()); Elements img = doc.select("img.archive-image"); int count = 1; for (Element image : img) { String url = image.absUrl("src"); try { Request request = new Request.Builder().url(url).build(); Response response = client.newCall(request).execute(); File file = new File(dir, String.valueOf(count) + ".png"); BufferedSink sink = Okio.buffer(Okio.sink(file)); sink.writeAll(response.body().source()); sink.close(); response.body().close(); } catch (Exception e) { e.printStackTrace(); } int p = (int) (count / ((float) img.size()) * 100); mBuilder.setProgress(100, p, false); mNotificationManager.notify(1, mBuilder.build()); count++; } if (prefHelper.getNewestWhatIf() == 0) prefHelper.setNewestWhatif(count - 1); } catch 
(IOException e) { e.printStackTrace(); } // download html int size = prefHelper.getNewestWhatIf(); for (int i = 1; i <= size; i++) { try { doc = Jsoup.connect("https://what-if.xkcd.com/" + String.valueOf(i)).get(); dir = new File(sdCard.getAbsolutePath() + OFFLINE_WHATIF_PATH + String.valueOf(i)); dir.mkdirs(); File file = new File(dir, String.valueOf(i) + ".html"); BufferedWriter writer = new BufferedWriter(new FileWriter(file)); writer.write(doc.outerHtml()); writer.close(); // download images int count = 1; for (Element e : doc.select(".illustration")) { try { String url = "http://what-if.xkcd.com" + e.attr("src"); Request request = new Request.Builder().url(url).build(); Response response = client.newCall(request).execute(); dir = new File(sdCard.getAbsolutePath() + OFFLINE_WHATIF_PATH + String.valueOf(i)); if (!dir.exists()) dir.mkdirs(); file = new File(dir, String.valueOf(count) + ".png"); BufferedSink sink = Okio.buffer(Okio.sink(file)); sink.writeAll(response.body().source()); sink.close(); response.body().close(); count++; } catch (Exception e2) { Log.e("article" + i, e2.getMessage()); } } int p = (int) (i / ((float) size) * 100); mBuilder.setProgress(100, p, false); mBuilder.setContentText(i + "/" + size); mNotificationManager.notify(1, mBuilder.build()); } catch (Exception e) { Log.e("article" + i, e.getMessage()); } } } prefHelper.setSunbeamLoaded(); Intent restart = new Intent("de.tap.easy_xkcd.ACTION_COMIC"); restart.putExtra("number", prefHelper.getLastComic()); PendingIntent pendingIntent = PendingIntent.getActivity(this, 1, restart, PendingIntent.FLAG_UPDATE_CURRENT); mBuilder .setContentIntent(pendingIntent) .setContentText(getResources().getString(R.string.not_restart)); mNotificationManager.notify(1, mBuilder.build()); }
/**
 * Builds a group holding the lessons parsed from a schedule table.
 *
 * <p>Every row after the first two is walked cell by cell. A non-empty cell is split into a
 * subject and one or two "teacher;auditory" pairs, and one {@code Lesson} is created per pair.
 *
 * @param schedule element containing the schedule rows
 * @return a group with the parsed lessons, or {@code null} when no lesson was found
 */
public static Group getSchedule(Element schedule) {
  Elements trs = schedule.getElementsByTag(Constants.TR);
  // The first two rows are dropped - presumably table headers; TODO confirm against the page.
  // NOTE(review): both calls throw if the table has fewer than two rows.
  trs.remove(0);
  trs.remove(0);
  int numberOfDay = 0;
  List<Lesson> lessonList = new ArrayList<Lesson>();
  Long parity = null;
  for (Element tr : trs) {
    int checkParity = 0;
    // A row containing an image switches the parity (2 for "data/1.gif", otherwise 1) and
    // restarts the day counter at 1.
    if (tr.getElementsByTag("img").size() != 0) {
      if (tr.getElementsByTag("img").first().attr("src").equals("data/1.gif")) {
        parity = new Long(2);
      } else {
        parity = new Long(1);
      }
      checkParity = 1;
      numberOfDay = 1;
    }
    Elements tds = tr.getElementsByTag(Constants.TD);
    // Index into getRings(); starts at 1 and advances once per remaining cell.
    int i = 1;
    // Rows with the parity image have one extra leading cell to skip.
    if (checkParity == 1) {
      tds.remove(0);
    }
    // The first remaining cell is skipped too - presumably the day label; TODO confirm.
    tds.remove(0);
    for (Element td : tds) {
      // Ring entries are expected in "start-end" form.
      String time = getRings().get(i);
      String timeStart = time.split("-")[0];
      String timeEnd = time.split("-")[1];
      // Cells that are empty once non-breaking spaces are removed carry no lesson.
      if (!td.text().replaceAll("\\u00A0", "").equals("")) {
        String subElement = td.text();
        int type = CommonUtils.getTypeOfLesson(getType(subElement));
        // Strip the lesson-type marker from the cell text.
        String sub = subElement.replace(getType(subElement), "");
        String subSplit = null;
        // Expand two abbreviations, normalise "а. сз", and turn teacher-title abbreviations
        // into ';' separators. When the text contains "ест." the bare "ст." title is NOT
        // replaced (only "ст.пр." is), so that the "ст." inside "ест." stays intact.
        // NOTE(review): the '.' in "Дальневост.рег" is an unescaped regex wildcard.
        if (sub.matches("(.*)ест\\.(.*)")) {
          subSplit =
              sub.replaceAll("в проф\\.деят-сти", "в профессиональной деятельности")
                  .replaceAll("Дальневост.рег", "Дальневосточного региона")
                  .replace("а. сз", "а.сз")
                  .replaceAll("асс\\.", ";")
                  .replaceAll("ст\\.пр\\.", ";")
                  .replaceAll("доц\\.", ";")
                  .replaceAll("проф\\.", ";");
        } else {
          subSplit =
              sub.replaceAll("в проф\\.деят-сти", "в профессиональной деятельности")
                  .replaceAll("Дальневост.рег", "Дальневосточного региона")
                  .replace("а. сз", "а.сз")
                  .replaceAll("асс\\.", ";")
                  .replaceAll("ст\\.пр\\.", ";")
                  .replaceAll("ст\\.", ";")
                  .replaceAll("доц\\.", ";")
                  .replaceAll("проф\\.", ";");
        }
        // subList[0] is the subject part; further entries are "teacher auditory" parts.
        String subList[] = subSplit.split(";");
        String subject, sbj;
        // Each entry is "teacher;auditory"; the literal "null" stands for "no teacher".
        List<String> teachersAuditories = new ArrayList<String>();
        if (subList.length == 1) {
          // No teacher title found: the last space-separated word is the auditory and
          // everything before it is the subject.
          // NOTE(review): throws if the text contains no space (lastIndexOf returns -1).
          sbj = subList[0].substring(0, subList[0].lastIndexOf(" ")).trim().toLowerCase();
          // Capitalise the first letter of the lower-cased subject.
          subject = sbj.substring(0, 1).toUpperCase().concat(sbj.substring(1));
          teachersAuditories.add(
              "null;".concat(subList[0].substring(subList[0].lastIndexOf(" ") + 1)));
        } else {
          sbj = subList[0].trim().toLowerCase();
          subject = sbj.substring(0, 1).toUpperCase().concat(sbj.substring(1));
          if (subList.length == 2) {
            // One teacher: split the part at its last space into teacher and auditory.
            teachersAuditories.add(
                subList[1]
                    .substring(0, subList[1].lastIndexOf(" "))
                    .trim()
                    .concat(";")
                    .concat(subList[1].substring(subList[1].lastIndexOf(" ") + 1)));
          } else if (subList.length == 3) {
            // Two teachers: the first part is split at its last space.
            teachersAuditories.add(
                subList[1]
                    .trim()
                    .substring(0, subList[1].trim().lastIndexOf(" "))
                    .trim()
                    .concat(";")
                    .concat(subList[1].trim().substring(subList[1].trim().lastIndexOf(" ") + 1)));
            // The second part is split at its last '-' when it has one, else at its last space.
            if (subList[2].contains("-")) {
              teachersAuditories.add(
                  subList[2]
                      .trim()
                      .substring(0, subList[2].trim().lastIndexOf("-"))
                      .trim()
                      .concat(";")
                      .concat(
                          subList[2]
                              .trim()
                              .substring(subList[2].trim().lastIndexOf("-") + 1)
                              .trim()));
            } else {
              teachersAuditories.add(
                  subList[2]
                      .trim()
                      .substring(0, subList[2].trim().lastIndexOf(" "))
                      .trim()
                      .concat(";")
                      .concat(
                          subList[2].trim().substring(subList[2].trim().lastIndexOf(" ") + 1)));
            }
          }
          // More than three parts produce no teacher/auditory entry, hence no lesson.
        }
        for (String teacherAuditory : teachersAuditories) {
          Lesson lesson =
              new Lesson(
                  subject,
                  numberOfDay + "",
                  null,
                  timeStart,
                  timeEnd,
                  type,
                  teacherAuditory.split(";")[1],
                  null,
                  teacherAuditory.split(";")[0].equals("null")
                      ? null
                      : teacherAuditory.split(";")[0],
                  parity);
          lessonList.add(lesson);
          log.debug(lesson);
        }
      }
      i++;
    }
    // Every row advances the day counter.
    numberOfDay++;
  }
  if (!lessonList.isEmpty()) {
    Group group = new Group();
    group.setListLessons(lessonList);
    return group;
  } else {
    return null;
  }
}
private Element extend(Element sup, Element sub) { // Get the child elements for both the sup (super) element and the sub // (extended) element. Elements subElements = sub.children(); Elements supElements = sup.children().clone(); // For each element in the sub group, loop: for (Element e : subElements) { // If it's overridden, delete it from sup. if (e.hasAttr(Language.OVERRIDE_ATTRIBUTE)) { for (Element el : supElements) { if (el.attr(Language.IDENTIFICATION_ATTRIBUTE) .equals(e.attr(Language.IDENTIFICATION_ATTRIBUTE))) { supElements.remove(el); continue loop; } } // Fail silently if no element is found to override. continue loop; } else if (Language.isOverridden(e)) { // Some elements are automatically overridden if they exist. for (Element el : supElements) { if (el.tagName().equals(e.tagName())) { supElements.remove(el); continue loop; } } // Fail silently if no element is found to override. continue loop; } else if (e.tagName().equals("meta")) { // If this is a meta tag, if (e.hasAttr("name")) { // If it's got a name, Elements metaThatMatch = supElements.select( "meta[name=\"" + e.attr("name") + "\"]"); // Find and override the meta tag in supElements with that name. if (metaThatMatch.size() == 1) { supElements.remove(supElements.indexOf(metaThatMatch.first())); } } else if (e.hasAttr("http-equiv")) { // If it's got a http-equiv, Elements metaThatMatch = supElements.select( "meta[http-equiv=\"" + e.attr("http-equiv") + "\"]"); // Find and override the meta tag in supElements with that // http-equiv. if (metaThatMatch.size() == 1) { supElements.remove(supElements.indexOf(metaThatMatch.first())); } } } else { // If it's not overridden but does correspond to an element, // recursively extend it. 
for (Element el : supElements) { if (el.hasAttr(Language.IDENTIFICATION_ATTRIBUTE) && el.attr(Language.IDENTIFICATION_ATTRIBUTE) .equals(e.attr(Language.IDENTIFICATION_ATTRIBUTE))) { Element temp = extend(el.clone(), e.clone()).clone(); e.replaceWith(temp); supElements.remove(el); continue loop; } } } } // Add the elements from the sup to the beginning of sub. This is where // the real extension happens. Collections.reverse(supElements); for (Element e : supElements) { sub.prependChild(e.clone()); } return sub; }
@Override public SNComments parseDocument(Document doc) throws Exception { SNComments comments = new SNComments(); if (doc == null) { return comments; } Elements tableRows = doc.body().select("table tr table tr"); if (tableRows != null && tableRows.size() > 0) { tableRows.remove(0); // 获取下一页链接 Elements moreURLElements = tableRows.select("a:matches(More)"); String moreURL = null; if (moreURLElements.size() > 0) { moreURL = resolveRelativeSNURL(moreURLElements.attr("href")); } comments.setMoreURL(moreURL); String linkURL = null; String parentURL = null; String discussURL = null; String text = null; String created = null; SNUser user = null; String artistTitle = null; // 文章标题 String voteURL = null; for (int row = 0; row < tableRows.size(); row++) { int rowInPost = row % 2; Element rowElement = tableRows.get(row); if (rowInPost == 0) { Element textElement = rowElement.select("tr > td:eq(1) > span").first(); if (textElement == null) { break; } text = textElement.text(); user = new SNUser(); Element spanElement = rowElement.select("tr > td:eq(1) > div > span").first(); created = getCreateAt(spanElement.text()); Elements aElements = spanElement.select("span > a"); if (aElements != null && aElements.size() >= 4) { int size = aElements.size(); Element anthorURLElement = aElements.first(); user.setId(anthorURLElement.text()); Element linkURLElement = aElements.get(1); linkURL = resolveRelativeSNURL(linkURLElement.attr("href")); Element parentURLElement = aElements.get(2); parentURL = resolveRelativeSNURL(parentURLElement.attr("href")); Element artistAElement = aElements.last(); discussURL = resolveRelativeSNURL(artistAElement.attr("href")); artistTitle = artistAElement.text(); if (size == 6) { // TODO edit delete } } Element voteAElement = rowElement.select("tr > td:eq(0) a").first(); if (voteAElement != null) { // 登录用户的评论没有url voteURL = resolveRelativeSNURL(voteAElement.attr("href")); } comments.addComment( new SNComment( linkURL, parentURL, discussURL, text, created, user, 
artistTitle, voteURL, null)); } } } return comments; }
/**
 * Parses table rows into the list of programs found in one column.
 *
 * <p>Each row is expanded into 8 slots. A slot still covered by a {@code rowspan} from an
 * earlier row receives an empty placeholder; otherwise the next cell of the row is consumed and
 * its {@code dt} / {@code dd} texts become the program pair for that slot.
 *
 * @param rows source rows
 * @param column index (0-7) of the slot whose programs are returned
 * @return one {@code {dt text, dd text}} pair per row whose selected slot has both texts
 *     non-empty, in row order
 */
private static String[][] parseRows(Elements rows, int column) {
  // NOTE(review): 'rowspan' is only ever appended to; nothing in this method reads it.
  List<List<Integer>> rowspan = new ArrayList<List<Integer>>();
  List<List<String[]>> programsList = new ArrayList<List<String[]>>();
  // Upper bound for the result: at most one program per row.
  String[][] temp_programs = new String[rows.size()][2];
  String[][] programs = temp_programs;
  // Per-slot rowspan bookkeeping; a value above 1 means the slot is still covered by a cell
  // from an earlier row.
  int[] rowspan_rest = new int[8];
  for (int i = 0; i < rows.size(); i++) {
    Element row = rows.get(i);
    try {
      Elements cells = row.children();
      List<Integer> rowspanlist = new ArrayList<Integer>();
      if (null != cells && cells.size() > 0) {
        List<String[]> data = new ArrayList<String[]>();
        for (int m = 0; m < 8; m++) {
          if (rowspan_rest[m] > 1) {
            // Slot is covered by a spanning cell from above: placeholder, one row fewer to go.
            rowspanlist.add(Integer.valueOf(0));
            String[] program = {"", ""};
            data.add(program);
            rowspan_rest[m]--;
          } else {
            // Consume the next unread cell of this row for slot m.
            // NOTE(review): get(0) throws when the row runs out of cells, and valueOf throws
            // when the rowspan attribute is missing or not a number. Either way the catch
            // below drops the whole row, with rowspan_rest already updated for earlier slots.
            Element cell = cells.get(0);
            Integer span = Integer.valueOf(cell.attr("rowspan"));
            rowspanlist.add(span);
            String[] program = new String[2];
            program[0] = DBclass.xmlFilte(cell.select("dt").text());
            program[1] = DBclass.xmlFilte(cell.select("dd").text());
            data.add(program);
            cells.remove(0);
            rowspan_rest[m] = span.intValue();
          }
        }
        programsList.add(data);
      } else {
        // A row without cells yields 8 placeholders and counts down every slot.
        List<String[]> data = new ArrayList<String[]>();
        for (int r = 0; r < 8; r++) {
          rowspanlist.add(Integer.valueOf(0));
          String[] program = {"", ""};
          data.add(program);
          rowspan_rest[r]--;
        }
        programsList.add(data);
      }
      rowspan.add(rowspanlist);
    } catch (Exception e) {
      // The failing row is skipped: nothing is added to programsList for it.
      e.printStackTrace(System.out);
    }
  }
  // Compact the selected column: 'l' only advances for rows that have both texts, because the
  // else branch undoes the increment of the loop header.
  int l = 0;
  for (int n = 0; n < programsList.size(); n++, l++) {
    List<String[]> data = programsList.get(n);
    String[] program = data.get(column);
    if (!StringUtils.isNullOrEmpty(program[0]) && !StringUtils.isNullOrEmpty(program[1])) {
      temp_programs[l][0] = program[0];
      temp_programs[l][1] = program[1];
    } else {
      l--;
    }
  }
  // Copy the filled prefix into an array of exactly the right length.
  programs = new String[l][2];
  for (int m = 0; m < l; m++) {
    programs[m][0] = temp_programs[m][0];
    programs[m][1] = temp_programs[m][1];
  }
  return programs;
}