Ejemplo n.º 1
0
  /**
   * Parses a single tree item from its design element, registers it in the data source and then
   * descends into the element's children so that the whole subtree is imported.
   *
   * @since 7.5.0
   * @param node the design element describing the item (tree node)
   * @param selected accumulator for selected item ids; the id of this item is added to it when
   *     the element carries a {@code selected} attribute
   * @param context the DesignContext instance used in parsing
   * @return the item id of the newly added item, read from the element's {@code text} attribute
   * @throws DesignException if the tag name of the {@code node} element is not {@code node}
   */
  @Override
  protected String readItem(Element node, Set<String> selected, DesignContext context) {
    final String tagName = node.tagName();
    if (!"node".equals(tagName)) {
      throw new DesignException(
          "Unrecognized child element in " + getClass().getSimpleName() + ": " + tagName);
    }

    // The visible text doubles as the item id.
    final String id = node.attr("text");
    addItem(id);

    if (node.hasAttr("icon")) {
      Resource iconResource =
          DesignAttributeHandler.readAttribute("icon", node.attributes(), Resource.class);
      setItemIcon(id, iconResource);
    }

    if (node.hasAttr("selected")) {
      selected.add(id);
    }

    // Each child is read (and thereby added) first, then attached to this item.
    for (Element childNode : node.children()) {
      String childId = readItem(childNode, selected, context);
      setParent(childId, id);
    }

    return id;
  }
 /**
  * Tells whether an HTML table is simple, i.e. none of its header or data cells declares a column
  * span or a row span. Complex tables (with at least one spanning cell) are required to use
  * heading id alignment to aid screen readers.
  *
  * @param table the table element to inspect
  * @return {@code true} if no cell of the table carries a colspan or rowspan attribute
  */
 static boolean notComplexTable(Element table) {
   // @todo(dallison) Define a heuristic for this starting with > 1
   final String cellSelector = TH + ", " + TD;
   boolean simple = true;
   for (Element cell : table.select(cellSelector)) {
     if (!cell.hasAttr(COLSPAN) && !cell.hasAttr(ROWSPAN)) {
       continue;
     }
     // A single spanning cell is enough to call the table complex.
     simple = false;
     break;
   }
   return simple;
 }
Ejemplo n.º 3
0
  // Reads bytes first into a buffer, then decodes with the appropriate charset. Done this way to
  // support switching the charset midstream when a meta http-equiv tag defines the charset.
  //
  // Resolution order, as implemented below:
  //   1. the result of detectCharsetFromBom(byteData, charsetName) replaces charsetName;
  //   2. if that is still null, the data is decoded with defaultCharset, parsed once, and a charset
  //      is looked up in <meta http-equiv=content-type>, <meta charset> or the XML declaration;
  //      when a different charset is found the buffer is rewound and decoded again;
  //   3. otherwise the given (non-empty) charsetName is used directly.
  //
  // Returns the parsed Document. Its output charset is set to charsetName only when the final
  // parse at the bottom of the method runs, i.e. not when the initial defaultCharset parse is kept.
  // todo - this is getting gnarly. needs a rewrite.
  static Document parseByteData(
      ByteBuffer byteData, String charsetName, String baseUri, Parser parser) {
    String docData;
    Document doc = null;

    // look for BOM - overrides any other header or input
    charsetName = detectCharsetFromBom(byteData, charsetName);

    if (charsetName == null) { // determine from meta. safe first parse as UTF-8
      // look for <meta http-equiv="Content-Type" content="text/html;charset=gb2312"> or HTML5 <meta
      // charset="gb2312">
      docData = Charset.forName(defaultCharset).decode(byteData).toString();
      doc = parser.parseInput(docData, baseUri);
      Element meta = doc.select("meta[http-equiv=content-type], meta[charset]").first();
      String foundCharset = null; // if not found, will keep utf-8 as best attempt
      if (meta != null) {
        // The http-equiv form is consulted first; the charset attribute is only a fallback.
        if (meta.hasAttr("http-equiv")) {
          foundCharset = getCharsetFromContentType(meta.attr("content"));
        }
        if (foundCharset == null && meta.hasAttr("charset")) {
          foundCharset = meta.attr("charset");
        }
      }
      // look for <?xml encoding='ISO-8859-1'?>
      if (foundCharset == null
          && doc.childNodeSize() > 0
          && doc.childNode(0) instanceof XmlDeclaration) {
        XmlDeclaration prolog = (XmlDeclaration) doc.childNode(0);
        if (prolog.name().equals("xml")) {
          foundCharset = prolog.attr("encoding");
        }
      }
      // NOTE(review): validateCharset is defined elsewhere; presumably it yields null for names
      // that cannot be used - confirm.
      foundCharset = validateCharset(foundCharset);

      if (foundCharset != null && !foundCharset.equals(defaultCharset)) { // need to re-decode
        foundCharset = foundCharset.trim().replaceAll("[\"']", "");
        charsetName = foundCharset;
        // The first decode consumed the buffer, so go back to its start before decoding again.
        byteData.rewind();
        docData = Charset.forName(foundCharset).decode(byteData).toString();
        // Drop the first parse so that the re-decoded text is parsed below.
        doc = null;
      }
    } else { // specified by content type header (or by user on file load)
      Validate.notEmpty(
          charsetName,
          "Must set charset arg to character set of file to parse. Set to null to attempt to detect from HTML");
      docData = Charset.forName(charsetName).decode(byteData).toString();
    }
    // doc is still null when no parse has happened yet or when the first parse was discarded.
    if (doc == null) {
      doc = parser.parseInput(docData, baseUri);
      doc.outputSettings().charset(charsetName);
    }
    return doc;
  }
Ejemplo n.º 4
0
  /**
   * Finds the pupil that is currently selected on the page, inserting any pupil that is not known
   * yet.
   *
   * <p>The pupil drop-down (element id {@code ctl00_topMenu_pupil_drdPupils}) is examined first:
   * every {@code option} is looked up via {@link Pupil#getByFormId} and inserted when missing, and
   * the option whose {@code selected} attribute equals {@code "selected"} wins. When the page has
   * no such options, the single pupil is read from the {@code user-name} element and the {@code
   * ctl00_topMenu_tbUserId} field instead.
   *
   * @param doc the parsed page
   * @return the selected pupil, never {@code null}
   * @throws ParseException if no pupil could be determined, including the case where the fallback
   *     fields are absent from the page
   */
  public static Pupil getSelectedPupil(Document doc) throws ParseException {

    boolean found = false;
    Pupil p, selectedP = null;

    Elements pupilSelectors =
        doc.getElementsByAttributeValue("id", "ctl00_topMenu_pupil_drdPupils");
    for (Element pupilSelector : pupilSelectors) {

      Elements pupils = pupilSelector.getAllElements();
      for (Element pupil : pupils) {
        if (pupil.tagName().equals("option")) {

          String value = pupil.attr("value");

          found = true;
          if ((p = Pupil.getByFormId(value)) == null) {

            p = new Pupil(pupil.text(), value);
            long rowId = p.insert();

            if (BuildConfig.DEBUG) {
              Log.d("GshisHTMLParser", TS.get() + " Pupil.insert() = " + rowId);
            }
          }

          if (pupil.hasAttr("selected") && pupil.attr("selected").equals("selected")) {

            selectedP = p;
          }
        }
      }
    }

    if (!found) {

      if (BuildConfig.DEBUG) {
        Log.d("GshisParser", TS.get() + " Alternative fields found!");
      }

      Element userName = doc.getElementsByClass("user-name").first();
      Element userId = doc.getElementsByAttributeValue("id", "ctl00_topMenu_tbUserId").first();

      // first() returns null when nothing matched. In that case fall through to the
      // ParseException below instead of failing with a NullPointerException.
      if (userName != null && userId != null) {

        String name = userName.text();
        String id = userId.attr("value");

        if (BuildConfig.DEBUG) {
          Log.d("GshisParser", TS.get() + " name=" + name + " id=" + id);
        }

        if ((p = Pupil.getByFormId(id)) == null) {

          p = new Pupil(name, id);
          long rowId = p.insert();

          if (BuildConfig.DEBUG) {
            Log.d("GshisParser", TS.get() + " Pupil.insert() = " + rowId);
          }
        }

        selectedP = p;
      }
    }

    if (selectedP == null) {
      throw new ParseException("Pupils not found", 0);
    }

    return selectedP;
  }
Ejemplo n.º 5
0
  /**
   * Strips font-family and font-size declarations from every inline {@code style} attribute of
   * the given HTML and returns the resulting body markup.
   *
   * @param html the HTML fragment or document to clean
   * @return the inner HTML of the parsed document's body after the style attributes were rewritten
   */
  @SuppressLint("DefaultLocale")
  private String improveHtml(final String html) {

    final Document parsed = Jsoup.parse(html);

    for (final Element element : parsed.getAllElements()) {
      if (!element.hasAttr("style")) {
        continue;
      }

      for (final Attribute attribute : element.attributes()) {
        if (!"style".equals(attribute.getKey())) {
          continue;
        }

        // Rebuild the declaration list, keeping everything except the font rules.
        final StringBuilder kept = new StringBuilder();
        for (final String declaration : attribute.getValue().trim().split(";")) {
          final String lowered = declaration.toLowerCase(Locale.ENGLISH);
          final boolean fontRule =
              lowered.contains("font-family:") || lowered.contains("font-size:");
          if (!fontRule) {
            kept.append(declaration).append(";");
          }
        }
        attribute.setValue(kept.toString());
      }
    }

    return parsed.body().html();
  }
Ejemplo n.º 6
0
 /**
  * Imports bookmarks from an uploaded HTML file for the logged-in user. Every {@code a} element
  * that has an {@code add_date} attribute becomes one bookmark; a failure to store a single
  * bookmark is ignored so the remaining ones are still processed.
  *
  * @param compParameter the component parameter carrying the request and login information
  * @param multipartFile the uploaded file
  * @param json not used by this implementation
  */
 @Override
 public void upload(
     ComponentParameter compParameter,
     IMultipartFile multipartFile,
     HashMap<String, Object> json) {
   try {
     final ID userId = ItSiteUtil.getLoginUser(compParameter).getId();
     if (userId == null) {
       return;
     }

     final Document uploaded =
         Jsoup.parse(
             multipartFile.getInputStream(), compParameter.request.getCharacterEncoding(), "");
     for (final Element anchor : uploaded.getElementsByTag("a")) {
       if (!anchor.hasAttr("add_date")) {
         continue;
       }

       final BookmarkBean bookmark = new BookmarkBean();
       // add_date is multiplied by 1000 before being turned into a Date.
       final long addedAt = ConvertUtils.toLong(anchor.attr("add_date"), 0) * 1000;
       bookmark.setTitle(anchor.text());
       bookmark.setUrl(anchor.attr("href"));
       bookmark.setUserId(userId);
       bookmark.setUpdateDate(new Date(addedAt));
       try {
         BookmarkUtils.applicationModule.doUpdate(bookmark);
       } catch (Exception ignored) {
         // Best effort: a bookmark that cannot be stored is skipped.
       }
     }
   } catch (final Exception e) {
     throw DataObjectException.wrapException("没有权限");
   }
 }
Ejemplo n.º 7
0
  /**
   * Parses the XML file at the given path into one {@code DataStructure} per {@code FORM} element.
   *
   * <p>For every form the data is collected as part seqNum -> row seqNum -> graph seqNum -> graph
   * text. Parts are read from {@code PodPart} elements when the document contains any, otherwise
   * from {@code Part} elements. When {@code whatForm} is {@code "242"} or {@code "243"}, the first
   * part is stored under its own seqNum and every later part of the same form is merged, row by
   * row, into the entry stored under key {@code "1"}.
   *
   * @param xmlUrl path of the XML file to read
   * @param whatForm form code; {@code "242"} and {@code "243"} enable the merging of parts
   * @return the parsed forms, in document order
   * @throws IOException if the file cannot be read
   */
  public ArrayList<DataStructure> parseXML(String xmlUrl, String whatForm) throws IOException {
    System.err.println("Creating an XML database");
    File file = new File(xmlUrl);
    Document document = Jsoup.parse(file, "UTF-8");

    // Both decisions are the same for every form, so they are made once up front.
    boolean mergeParts = Objects.equals(whatForm, "242") || Objects.equals(whatForm, "243");
    String partTag = document.select("PodPart").isEmpty() ? "Part" : "PodPart";

    ArrayList<DataStructure> xmlData = new ArrayList<>();
    for (Element form : document.select("FORM")) {
      String kopuk = form.hasAttr("KO") ? form.attr("KO") : form.attr("KOPUK");
      Map<String, Map<String, Map<String, String>>> hashMapGlobal = new HashMap<>();

      for (Element part : form.select(partTag)) {
        Map<String, Map<String, String>> hashMapPart = new HashMap<>();
        for (Element row : part.select("Row")) {
          Map<String, String> hashMapGraph = new HashMap<>();
          for (Element graph : row.select("graph")) {
            hashMapGraph.put(graph.attr("seqNum"), graph.text());
          }
          hashMapPart.put(row.attr("seqNum"), hashMapGraph);
        }

        if (mergeParts) {
          System.out.println("Find 242 243 ");
          if (hashMapGlobal.size() > 0) {
            System.out.println(" hashMapGlobal.size()>0 ");
            // The merge target may be missing when the first part was not numbered "1";
            // create it instead of dereferencing null.
            Map<String, Map<String, String>> merged = hashMapGlobal.get("1");
            if (merged == null) {
              merged = new HashMap<>();
              hashMapGlobal.put("1", merged);
            }
            for (Map.Entry<String, Map<String, String>> rowEntry : hashMapPart.entrySet()) {
              // A row that only appears in a later part gets a fresh target map.
              Map<String, String> target = merged.get(rowEntry.getKey());
              if (target == null) {
                target = new HashMap<>();
                merged.put(rowEntry.getKey(), target);
              }
              target.putAll(rowEntry.getValue());
            }
          } else {
            hashMapGlobal.put(part.attr("seqNum"), hashMapPart);
          }
        } else {
          hashMapGlobal.put(part.attr("seqNum"), hashMapPart);
        }
      }
      xmlData.add(new DataStructure(kopuk, hashMapGlobal));
    }
    return xmlData;
  }
Ejemplo n.º 8
0
  /**
   * Reads the week drop-down (element id {@code ctl00_body_week_drdWeeks}), adds every week the
   * schedule does not know yet and returns the week whose option is marked as selected.
   *
   * <p>The start date of a new week is built from the day and month in front of the "-" of the
   * option text plus a year taken from the schedule's form text: the part before its "-" for
   * months greater than 7, the part after it otherwise.
   *
   * @param doc the parsed page
   * @param s the schedule the weeks belong to
   * @return the selected week, or {@code null} if no option is marked as selected
   * @throws ParseException if the page contains no week options or a start date cannot be parsed
   */
  public static Week getSelectedWeek(Document doc, Schedule s) throws ParseException {

    final SimpleDateFormat startFormat = new SimpleDateFormat("yyyy dd.MM", Locale.ENGLISH);
    startFormat.setTimeZone(TimeZone.getTimeZone("Europe/Moscow"));

    boolean anyOption = false;
    Week chosen = null;

    for (Element selector : doc.getElementsByAttributeValue("id", "ctl00_body_week_drdWeeks")) {
      for (Element option : selector.getAllElements()) {
        if (!option.tagName().equals("option")) {
          continue;
        }

        final String label = option.text();
        final String formId = option.attr("value");
        anyOption = true;

        Week current = s.getWeek(formId);
        if (current == null) {
          current = new Week();

          final String begin = label.substring(0, label.indexOf("-") - 1);
          final String month = begin.substring(begin.indexOf(".") + 1);
          final boolean firstYearOfPair = Integer.parseInt(month) > 7;

          final String yearsText = s.getFormText();
          final int dash = yearsText.indexOf("-");
          final String year =
              firstYearOfPair ? yearsText.substring(0, dash - 1) : yearsText.substring(dash + 2);

          current.setStart(startFormat.parse(year + " " + begin));
          current.setFormText(label);
          current.setFormId(formId);

          s.addWeek(current);
        }

        if (option.hasAttr("selected") && "selected".equals(option.attr("selected"))) {
          chosen = current;
          final long updated = current.setLoaded().update();

          if (BuildConfig.DEBUG) {
            Log.d("GshisHTMLParser", TS.get() + " Week.update() = " + updated);
          }
        }
      }
    }

    if (!anyOption) {
      throw new ParseException("Weeks not found", 0);
    }

    return chosen;
  }
Ejemplo n.º 9
0
  /**
   * Collects the media URLs linked from a page.
   *
   * <p>A link is considered when it has an href that is not blacklisted and, unless this is a
   * general chan site, contains one of the site's CDN domains. Its href must end in one of the
   * supported image/video extensions. Protocol-relative and host-relative hrefs are turned into
   * absolute http URLs, and each URL is returned at most once.
   *
   * @param page the page to scan
   * @return the media URLs in document order
   */
  @Override
  public List<String> getURLsFromPage(Document page) {
    List<String> imageURLs = new ArrayList<String>();
    // Compiled once: the pattern does not depend on the link being inspected.
    final Pattern mediaPattern =
        Pattern.compile(
            "^.*\\.(jpg|jpeg|png|gif|apng|webp|tif|tiff|webm)$", Pattern.CASE_INSENSITIVE);

    for (Element link : page.select("a")) {
      if (!link.hasAttr("href")) {
        continue;
      }
      String href = link.attr("href").trim();

      if (isURLBlacklisted(href)) {
        continue;
      }

      // On a specific chan site only links pointing at one of its CDN domains are accepted.
      boolean selfHosted = false;
      if (!generalChanSite) {
        for (String cdnDomain : chanSite.cdnDomains) {
          if (href.contains(cdnDomain)) {
            selfHosted = true;
            break;
          }
        }
      }

      if (selfHosted || generalChanSite) {
        if (mediaPattern.matcher(href).matches()) {
          if (href.startsWith("//")) {
            href = "http:" + href;
          }
          if (href.startsWith("/")) {
            href = "http://" + this.url.getHost() + href;
          }
          // Don't download the same URL twice
          if (imageURLs.contains(href)) {
            logger.debug("Already attempted: " + href);
            continue;
          }
          imageURLs.add(href);
          if (isThisATest()) {
            break;
          }
        }
      } else {
        // TODO also grab imgur/flickr albums (And all other supported rippers) Maybe add a setting?
      }

      if (isStopped()) {
        break;
      }
    }
    return imageURLs;
  }
Ejemplo n.º 10
0
  /**
   * Returns the full text of a cell: the {@code txttitle} attribute of the first {@code a} element
   * that has one, or the element's own text when there is no such link.
   *
   * @param e the cell element
   * @return the {@code txttitle} value if present, otherwise the text of {@code e}
   */
  public static String fetchLongCellString(Element e) {
    for (Element anchor : e.getElementsByTag("a")) {
      if (!anchor.hasAttr("txttitle")) {
        continue;
      }
      return anchor.attr("txttitle");
    }
    return e.text();
  }
 /**
  * Determines whether an element is not focusable using keys.
  *
  * <p>Only {@code a}, {@code button}, {@code input}, {@code select} and {@code textarea} elements
  * are considered; such an element is reported as not focusable when its tabindex is a negative
  * integer. A tabindex that is not a valid integer is treated as if it did not remove the element
  * from the focus order.
  *
  * <p>TODO(jharty): We need to determine which elements are generally focusable (where focus is
  * disabled by setting tabindex to a negative number) and which ones generally are NOT focusable,
  * but may become focusable by setting a positive tabindex. TODO(jharty): This could move to
  * OnClickIsFocusable?
  *
  * @param element the element to test
  * @return true when the element is not focusable, else false.
  */
 static boolean notFocusable(Element element) {
   List<String> focusable = Arrays.asList("a", "button", "input", "select", "textarea");
   if (!focusable.contains(element.tagName()) || !element.hasAttr(TAB_INDEX)) {
     return false;
   }
   try {
     return Integer.parseInt(element.attr(TAB_INDEX).trim()) < 0;
   } catch (NumberFormatException e) {
     // Markup from the wild may carry tabindex="" or other non-numeric values; do not let
     // that abort the check.
     return false;
   }
 }
Ejemplo n.º 12
0
  /**
   * Checks boolean attribute handling: setting {@code true} adds a value-less attribute, setting
   * {@code false} removes an attribute that was set before.
   */
  @Test
  public void testAddBooleanAttribute() {
    final Element element = new Element(Tag.valueOf("div"), "");

    // Added as a boolean attribute.
    element.attr("true", true);

    // First given a regular value, then switched off again.
    element.attr("false", "value");
    element.attr("false", false);

    assertTrue(element.hasAttr("true"));
    assertEquals("", element.attr("true"));

    final List<Attribute> remaining = element.attributes().asList();
    assertEquals("There should be one attribute", 1, remaining.size());
    final Attribute only = remaining.get(0);
    assertTrue("Attribute should be boolean", only instanceof BooleanAttribute);

    assertFalse(element.hasAttr("false"));

    assertEquals("<div true></div>", element.outerHtml());
  }
 // TODO(jharty) This has moved to NonInteractiveElementHasRole, could you
 // move the test also?
 /**
  * Tells whether the element's role attribute contains the name of any known {@code Role}.
  *
  * @param element the element to inspect
  * @return {@code true} if a role attribute is present and contains at least one role name
  */
 static boolean hasAriaRole(Element element) {
   if (element.hasAttr(ROLE)) {
     final String declared = element.attr(ROLE);
     // Substring match against the string form of every known role.
     for (Role candidate : Role.values()) {
       if (declared.contains(candidate.toString())) {
         return true;
       }
     }
   }
   return false;
 }
Ejemplo n.º 14
0
 /**
  * Loads an image page and resolves the gallery it belongs to from the first link whose href
  * starts with {@code gallery.php}.
  *
  * @param url the URL of the image page
  * @return the gallery URL on imagearn.com
  * @throws IOException if the page cannot be fetched or contains no gallery link
  */
 private URL getGalleryFromImage(URL url) throws IOException {
   final Document imagePage = Http.url(url).get();
   for (Element anchor : imagePage.select("a[href~=^gallery\\.php.*$]")) {
     logger.info("LINK: " + anchor.toString());
     if (!anchor.hasAttr("href") || !anchor.attr("href").contains("gallery.php")) {
       continue;
     }
     final URL galleryUrl = new URL("http://imagearn.com/" + anchor.attr("href"));
     logger.info("[!] Found gallery from given link: " + galleryUrl);
     return galleryUrl;
   }
   throw new IOException("Failed to find gallery at URL " + url);
 }
Ejemplo n.º 15
0
 /**
  * Tells whether the element declares a target attribute whose value is something other than
  * {@code TOP_TARGET_VALUE}, {@code PARENT_TARGET_VALUE} or {@code SELF_TARGET_VALUE} (compared
  * case-insensitively).
  *
  * @param element the element to inspect
  * @return {@code true} if a target attribute is present and is none of the three excluded
  *     values; {@code false} otherwise, including when the attribute is absent
  */
 private boolean doesElementHaveRequestedTargetAttribute(Element element) {
   if (!element.hasAttr(TARGET_ATTR)) {
     return false;
   }
   final String target = element.attr(TARGET_ATTR);
   final boolean excludedTarget =
       StringUtils.equalsIgnoreCase(target, TOP_TARGET_VALUE)
           || StringUtils.equalsIgnoreCase(target, PARENT_TARGET_VALUE)
           || StringUtils.equalsIgnoreCase(target, SELF_TARGET_VALUE);
   return !excludedTarget;
 }
Ejemplo n.º 16
0
 /**
  * Runs the inherited selection, then moves every element that has a longdesc attribute from the
  * selection without marker to the selection with marker.
  *
  * @param sspHandler the handler passed on to the inherited selection
  */
 @Override
 protected void select(SSPHandler sspHandler) {
   super.select(sspHandler);
   // The elements with a longdesc attribute are seen as informative.
   // They are added to the selection with marker
   for (Iterator<Element> it = getSelectionWithoutMarkerHandler().get().iterator();
       it.hasNext(); ) {
     final Element candidate = it.next();
     if (!candidate.hasAttr(LONGDESC_ATTR)) {
       continue;
     }
     it.remove();
     getSelectionWithMarkerHandler().add(candidate);
   }
 }
Ejemplo n.º 17
0
  /**
   * Reads the semester drop-down (element id {@code ctl00_body_drdTerms}), adds every semester
   * the schedule does not know yet and returns the one whose option is marked as selected.
   *
   * <p>For a new semester the start date is taken from the option text between offset 12 and the
   * character before the space preceding "-", and the stop date from the text following "- " up
   * to two characters before the end; both are parsed as {@code dd.MM.yyyy}.
   *
   * @param doc the parsed page
   * @param sch the schedule the semesters belong to
   * @return the selected semester, or {@code null} if no option is marked as selected
   * @throws ParseException if the page contains no semester options or a date cannot be parsed
   */
  public static GradeSemester getActiveGradeSemester(Document doc, Schedule sch)
      throws ParseException {

    boolean anyOption = false;
    GradeSemester active = null;

    final SimpleDateFormat dayFormat = new SimpleDateFormat("dd.MM.yyyy", Locale.ENGLISH);
    dayFormat.setTimeZone(TimeZone.getTimeZone("Europe/Moscow"));

    for (Element selector : doc.getElementsByAttributeValue("id", "ctl00_body_drdTerms")) {
      for (Element option : selector.getAllElements()) {
        if (!option.tagName().equals("option")) {
          continue;
        }

        final String label = option.text();
        final String formId = option.attr("value");
        anyOption = true;

        GradeSemester semester = sch.getSemester(formId);
        if (semester == null) {
          semester = new GradeSemester();

          final int dash = label.indexOf("-");
          semester.setStart(dayFormat.parse(label.substring(12, dash - 1)));
          semester.setStop(dayFormat.parse(label.substring(dash + 2, label.length() - 2)));
          semester.setFormText(label);
          semester.setFormId(formId);

          sch.addSemester(semester);
        }

        if (option.hasAttr("selected") && "selected".equals(option.attr("selected"))) {
          final long updated = semester.setLoaded().update();
          active = semester;

          if (BuildConfig.DEBUG) {
            Log.d("GshisHTMLParser", TS.get() + " Semester.update() = " + updated);
          }
        }
      }
    }

    if (!anyOption) {
      throw new ParseException("Semesters not found", 0);
    }

    return active;
  }
Ejemplo n.º 18
0
  /**
   * Reads the learning-year drop-down (element id {@code ctl00_learnYear_drdLearnYears}), adds
   * every year the pupil has no schedule for yet and returns the schedule whose option is marked
   * as selected.
   *
   * <p>A new schedule starts on 01.09 of the year in front of the "-" of the option text and
   * stops on 31.05 of the year behind it.
   *
   * @param doc the parsed page
   * @param selPupil the pupil the schedules belong to
   * @return the selected schedule, or {@code null} if no option is marked as selected
   * @throws ParseException if the page contains no year options or a date cannot be parsed
   */
  public static Schedule getSelectedSchedule(Document doc, Pupil selPupil) throws ParseException {

    boolean anyOption = false;
    Schedule chosen = null;

    for (Element selector :
        doc.getElementsByAttributeValue("id", "ctl00_learnYear_drdLearnYears")) {
      for (Element option : selector.getAllElements()) {
        if (!option.tagName().equals("option")) {
          continue;
        }

        final String formId = option.attr("value");
        anyOption = true;

        Schedule schedule = selPupil.getScheduleByFormId(formId);
        if (schedule == null) {
          final SimpleDateFormat yearDayFormat = new SimpleDateFormat("yyyy dd.MM", Locale.ENGLISH);
          yearDayFormat.setTimeZone(TimeZone.getTimeZone("Europe/Moscow"));

          final String label = option.text();
          final int dash = label.indexOf("-");
          schedule = new Schedule(formId, label);

          final Date firstDay = yearDayFormat.parse(label.substring(0, dash - 1) + " 01.09");
          final Date lastDay = yearDayFormat.parse(label.substring(dash + 2) + " 31.05");

          schedule.setStart(firstDay);
          schedule.setStop(lastDay);

          selPupil.addSchedule(schedule);
        }

        if (option.hasAttr("selected") && "selected".equals(option.attr("selected"))) {
          chosen = schedule;
        }
      }
    }

    if (!anyOption) {
      throw new ParseException("Years not found", 0);
    }

    return chosen;
  }
Ejemplo n.º 19
0
 /**
  * Filters the current elements by an attribute selector of the form {@code name=value}.
  *
  * <p>The selector text is obtained from {@code getValue(query, ATTR_TAG)} and must contain
  * exactly one {@code =}. An empty value ({@code name=}) is accepted and matches elements whose
  * attribute is present with an empty value.
  *
  * @param query the selector part to evaluate
  * @return the elements that have the attribute with exactly the given value
  * @throws IllegalArgumentException if the selector does not contain exactly one {@code =}
  */
 private Elements parseAttr(String query) {
   String value = getValue(query, ATTR_TAG);
   int separator = value.indexOf('=');
   if (separator < 0 || separator != value.lastIndexOf('=')) {
     throw new IllegalArgumentException("Argument selector part: " + query + " is illegal");
   }
   // Cut at the separator instead of using split("="), which drops a trailing empty value
   // and would leave no element for the value part.
   String attrName = value.substring(0, separator);
   String attrValue = value.substring(separator + 1);
   Elements eles = new Elements();
   for (Element element : elements) {
     if (element.hasAttr(attrName) && element.attr(attrName).equals(attrValue)) {
       eles.add(element);
     }
   }
   return eles;
 }
Ejemplo n.º 20
0
 /**
  * Evaluates the title attribute of a link. The test is not applicable when the element has no
  * title attribute, when the title is blank, or when it equals the link text ignoring case;
  * otherwise the result is delegated to {@code titlePertinenceElementChecker}.
  *
  * @param sspHandler the handler passed on to the checker
  * @param el the link element
  * @param linkText the text of the link the title is compared with
  * @return {@code NOT_APPLICABLE} in the cases above, else the solution computed by the checker
  */
 private TestSolution testTitleAttributeLink(SSPHandler sspHandler, Element el, String linkText) {
   if (!el.hasAttr(TITLE_ATTR)) {
     return TestSolution.NOT_APPLICABLE;
   }

   final String title = el.attr(TITLE_ATTR);
   final boolean nothingToCheck =
       StringUtils.isBlank(title) || StringUtils.equalsIgnoreCase(title, linkText);
   if (nothingToCheck) {
     return TestSolution.NOT_APPLICABLE;
   }

   ElementHandler<Element> singleElement = new ElementHandlerImpl(el);
   TestSolutionHandler result = new TestSolutionHandlerImpl();
   titlePertinenceElementChecker.check(sspHandler, singleElement, result);
   return result.getTestSolution();
 }
 /**
  * Check if an HTML element is an input of the image type.
  *
  * @param element the element to inspect
  * @return {@code true} if the node name equals {@code INPUT} and the type attribute is present
  *     and equals {@code IMAGE}
  */
 static boolean isImageInput(Element element) {
   if (!element.nodeName().equals(INPUT)) {
     return false;
   }
   if (!element.hasAttr(TYPE)) {
     return false;
   }
   return element.attr(TYPE).equals(IMAGE);
 }
Ejemplo n.º 22
0
  /**
   * Downloads the movie list from rottentomatoes.com and writes one text file per movie into the
   * {@code movies2/} directory. Each file holds a JSON-like record with the title, synopsis,
   * genres, audience rate and the user comments of all review pages.
   *
   * <p>Movies whose page answers with an HTTP error status are counted and skipped; movies whose
   * file already exists are skipped as well. Progress and errors are printed to standard output.
   * The output file of a movie is closed on every path, including when comment download fails.
   */
  public void crawl() {

    int movieError = 0;
    int totalMovies = 0;
    int first = 1;
    File file;
    Document doc;
    Elements newsHeadlines;
    Elements reviews;
    Elements title;
    Elements synopsis;
    Elements spans;
    Elements audienceReviews;
    long startTime = System.currentTimeMillis();
    try {
      System.out.println("INFO: Create DataSet");

      System.out.println("INFO: Get Movies List");
      doc =
          Jsoup.connect("http://www.rottentomatoes.com/movie/allMovies/").timeout(10 * 1000).get();
      newsHeadlines = doc.select("loc");
      System.out.println("INFO: Find " + newsHeadlines.size() + " movies");
      totalMovies = newsHeadlines.size();
      for (int i = 0; i < newsHeadlines.size(); i++) {

        System.out.println("INFO: Movie " + i + " of " + totalMovies);
        System.out.println("INFO: Add Movie");

        // Get movie Page
        try {
          doc = Jsoup.connect(newsHeadlines.get(i).text()).timeout(10 * 1000).get();
        } catch (HttpStatusException ex) {
          movieError++;
          continue;
        }

        // Get Movie Title
        title = doc.select("h1.movie_title");
        file =
            new File(
                "movies2/"
                    + title.text().toString().replaceAll("\\<.*?>", "").replaceAll(" ", "_")
                    + ".txt");
        if (!file.exists()) {
          try {
            file.createNewFile();
          } catch (IOException ioe) {
            System.out.println("ERROR: IOException - " + ioe.getMessage());
            continue;
          }
        } else {
          // A file for this title already exists; the movie is not written again.
          continue;
        }

        // try-with-resources closes the file on every exit from this block: normal completion,
        // the 'continue' below, and IOExceptions that propagate to the outer handler.
        try (FileWriter writer = new FileWriter(file)) {

          System.out.println(
              "INFO: Movie Title - " + title.text().toString().replaceAll("\\<.*?>", ""));
          writer.append(
              "{\n\t\"title\": \"" + title.text().toString().replaceAll("\"", "") + "\",\n");
          writer.flush();

          // Get Movie Synopsis
          synopsis = doc.select("#movieSynopsis");
          System.out.println("INFO: Get Synopsis");
          writer.append(
              "\t\"synopsis\":\"" + synopsis.text().toString().replaceAll("\"", "") + "\",\n");
          writer.flush();

          // Get Movie Genres
          System.out.println("INFO: Get Genre");
          spans = doc.select("span");
          writer.append("\t\"genre\":\n\t[\n");
          first = 1;
          for (Element e : spans) {
            if (e.hasAttr("itemprop")) {
              if (e.attr("itemprop").equals("genre")) {
                if (first == 1) {
                  writer.append(
                      "\t\t{\"name\":\"" + e.text().toString().replaceAll("\"", "") + "\"}");
                  first = 0;
                } else {
                  writer.append(
                      ",\n\t\t{\"name\":\"" + e.text().toString().replaceAll("\"", "") + "\"}\n");
                }

                writer.flush();
              }
            }
          }
          writer.append("\n\t],\n"); // End of Genre
          writer.flush();

          // Get Movie Audience Review
          System.out.println("INFO: Get Audience Review");
          audienceReviews = doc.select("p.critic_stats");
          if (audienceReviews.size() > 2) {
            String review = audienceReviews.get(2).text();
            review = review.replace("liked it ", "");

            StringTokenizer st = new StringTokenizer(review, " ");
            String key = null;
            while (st.hasMoreTokens()) {
              key = st.nextToken();
              if (key.contains("/")) {
                // The token looks like "<rate>/<max>"; the tokenizer is replaced so that the
                // part before the slash can be written as the rate.
                st = new StringTokenizer(key, "/");
                writer.append("\t\"rate\":" + st.nextToken() + ",\n");
                writer.flush();
              }
            }
          } else {
            writer.append("\t\"rate\":0,\n");
            writer.flush();
          }

          try {
            int pageNum = 1;
            System.out.println("INFO: Start Get comments");
            writer.append("\t\"comments\":[\n");
            first = 1;
            do {
              doc =
                  Jsoup.connect(
                          newsHeadlines.get(i).text() + "/reviews/?type=user&page=" + pageNum)
                      .timeout(10 * 1000)
                      .get();
              reviews = doc.select(".user_review");
              Elements score = null;
              if (reviews.size() != 0) {
                for (Element el : reviews) {
                  score = el.select("div.scoreWrapper");

                  // What remains in the class attribute after removing the fixed classes is
                  // handed to getCommentScore.
                  String rating =
                      score
                          .select("span")
                          .removeClass("small")
                          .removeClass("rating")
                          .removeClass("stars")
                          .attr("class");
                  el.select("span").remove();
                  el.select(".scoreWrapper").remove();
                  el.select("br").remove();
                  if (first == 1) {
                    writer.append(
                        "\t\t{\n\t\t\t\"rate\":"
                            + getCommentScore(rating)
                            + ",\n\t\t\t\"text\": \""
                            + el.text().toString().replaceAll("\"", "")
                            + "\"\n}");
                    first = 0;
                  } else {
                    writer.append(
                        ",\n\t\t{\n\t\t\t\"rate\":"
                            + getCommentScore(rating)
                            + ",\n\t\t\t\"text\": \""
                            + el.text().toString().replaceAll("\"", "")
                            + "\"\n}");
                  }
                  writer.flush();
                }
                System.out.println("INFO: Comment Page: " + pageNum);
                pageNum++;
              } else {
                // An empty review page marks the end of the comments.
                writer.append("\t]\n}");
                System.out.println("INFO: End Get Comments\n\n");
                break;
              }
            } while (true);
          } catch (IOException e1) {
            System.out.println("ERROR: " + e1.getMessage());
            e1.printStackTrace();
          } catch (Exception e) {
            System.out.println("ERROR: " + e.getMessage());
            e.printStackTrace();
            writer.append("\t]\n}");
            continue;
          }
        }
      }

      System.out.println("INFO: Done!");
      System.out.println("INFO: Failed to parse" + movieError);
      long endTime = System.currentTimeMillis();
      long totalTime = endTime - startTime;
      NumberFormat formatter = new DecimalFormat("#0.00000");
      System.out.println("INFO: Total Time: " + formatter.format((totalTime) / 1000d) + " s");
    } catch (IOException e) {
      System.out.println("ERROR: " + e.getMessage());
      e.printStackTrace();
    }
  }
Ejemplo n.º 23
0
  /**
   * Walks every diary table of the page (elements whose class attribute is exactly {@code "table
   * diary"}) and copies theme, homework, marks and comment of each row into the matching lesson
   * of the schedule, calling {@code update()} on every lesson that was found.
   *
   * <p>Cells are told apart by their class attribute ({@code date}, {@code diary-mark}, {@code
   * diary-comment}) or, for cells without such a class, by a running cell counter: the cell
   * matching {@code SUBJECT_NAME} sets the counter to 2, the next unclassified cell is read as
   * the theme and the one after it as the homework.
   *
   * @param doc the parsed page
   * @param s the schedule the lessons are looked up in
   * @throws ParseException if a date cell cannot be parsed as {@code dd.MM.yyyy}
   */
  public static void getLessonsDetails(Document doc, Schedule s) throws ParseException {

    final SimpleDateFormat fmt = new SimpleDateFormat("dd.MM.yyyy", Locale.ENGLISH);
    fmt.setTimeZone(TimeZone.getTimeZone("Europe/Moscow"));

    Elements tableCells = doc.getElementsByAttributeValue("class", "table diary");
    for (Element tableCell : tableCells) {

      // tdCount and date are kept across rows: a date cell applies to the following rows until
      // the next date cell is seen.
      int tdCount = 0;
      Date date = null;
      Lesson l, lPrev = null; // lPrev to handle duplicate lesson
      int sameLesson = 0; // Also to handle duplicate lesson

      Elements trs = tableCell.getElementsByTag("tr");
      for (Element tr : trs) {

        // Header rows carry no lesson data.
        if (tr.hasAttr("class") && tr.attr("class").equals("table-header")) continue;

        l = null;
        sameLesson = 0; // assume no bug here
        Elements tds = tr.getElementsByTag("td");

        for (Element td : tds) {

          if (td.hasAttr("class") && td.attr("class").equals("date")) {

            // The date is read from the first div inside the cell.
            date = fmt.parse(td.getElementsByTag("div").first().text());
            tdCount = 1;

          } else if (td.hasAttr("class") && td.attr("class").equals("diary-mark")) {

            String marks = fetchLongCellStringNoWhitespaces(td);
            if (l != null && marks != null) {

              // NOTE(review): fixDuplicateString is defined elsewhere; presumably it separates
              // this lesson's value from the previous duplicate's - confirm.
              if (sameLesson > 0 && lPrev != null) {

                l.setMarks(fixDuplicateString(marks, lPrev.getMarks(), sameLesson));
              } else l.setMarks(marks);
            }
            tdCount++;

          } else if (td.hasAttr("class") && td.attr("class").equals("diary-comment")) {

            String comment = fetchLongCellStringNoWhitespaces(td);
            if (l != null && comment != null) {

              if (sameLesson > 0 && lPrev != null) {

                l.setComment(fixDuplicateString(comment, lPrev.getComment(), sameLesson));
              } else l.setComment(comment);
            }

            tdCount++;

          } else if (tdCount == 2) {

            // First unclassified cell after the subject cell: the theme.
            String theme = fetchLongCellStringNoWhitespaces(td);
            if (l != null && theme != null) {

              if (sameLesson > 0 && lPrev != null) {

                l.setTheme(fixDuplicateString(theme, lPrev.getTheme(), sameLesson));
              } else l.setTheme(theme);
            }
            tdCount++;

          } else if (tdCount == 3) {

            // Second unclassified cell after the subject cell: the homework.
            String homework = fetchLongCellStringNoWhitespaces(td);
            if (l != null && homework != null) {

              if (sameLesson > 0 && lPrev != null) {

                l.setHomework(fixDuplicateString(homework, lPrev.getHomework(), sameLesson));
              } else l.setHomework(homework);
            }
            tdCount++;

          } else if (SUBJECT_NAME.matcher(td.text()).find()) {

            // Subject cell: its first character is parsed as the lesson number, which together
            // with the current date identifies the lesson.
            tdCount = 2;
            int number = Integer.parseInt(td.text().substring(0, 1));
            l = s.getLessonByNumber(date, number);

            if (lPrev != null
                && l != null
                && l.getStart().equals(lPrev.getStart())
                && l.getNumber() == lPrev.getNumber()) {

              // We hit the same lesson bug
              sameLesson++;
            }

          } else {
            tdCount++;
          }
        }

        // Remember the lesson of this row for the duplicate check of the next row.
        if (l != null) {
          lPrev = l;
          l.update();
        }
      }
    }
  }
Ejemplo n.º 24
0
  /**
   * Reads the grade tables of a page and stores the result in the given schedule.
   *
   * <p>Every {@code tr} of every element whose {@code class} is exactly {@code "table rating"} is
   * turned into a {@link GradeRec}, except rows whose class is {@code "table-header"}. The
   * {@code th} with class {@code "table-header3"} supplies the form text; the following
   * {@code "cell-header2"} headers supply, in order, the absent, released and sick counters and
   * the average. The last {@code td} of the row supplies the total when it holds a plain integer.
   *
   * <p>A row is only persisted when its form text passes {@code containsPrintableChars}. If the
   * schedule already has a record for the same start date and form text, that record is refreshed
   * (its marks are deleted first); otherwise the new record is added. Marks found in
   * {@code "grade-with-type"} cells are then updated or attached to that record.
   *
   * <p>Side effect: the static {@code mNewMarks} list is replaced by a fresh list holding every
   * {@link MarkRec} created during this call.
   *
   * @param doc the parsed page containing the grade tables
   * @param sch the schedule that is queried for existing records and receives new ones
   * @param s the semester whose start and stop are copied onto each record
   * @throws ParseException declared in the signature; no statement in this method raises it
   *     directly
   * @throws NumberFormatException if a counter or average header passes
   *     {@code containsPrintableChars} but is not numeric
   */
  public static void getGrades(Document doc, Schedule sch, GradeSemester s) throws ParseException {

    mNewMarks = new ArrayList<MarkRec>();

    Elements tableCells = doc.getElementsByAttributeValue("class", "table rating");
    for (Element tableCell : tableCells) {

      Elements trs = tableCell.getElementsByTag("tr");
      for (Element tr : trs) {

        if (tr.hasAttr("class") && tr.attr("class").equals("table-header")) continue;

        GradeRec rec = new GradeRec();
        // Position of the next "cell-header2" column; set to 2 once the form name has been seen.
        int thCount = 0;

        Elements ths = tr.getElementsByTag("th");
        for (Element th : ths) {

          if (th.hasAttr("class") && th.attr("class").equals("table-header3")) {

            rec.setFormText(th.text());
            thCount = 2;

          } else if (th.hasAttr("class") && th.attr("class").equals("cell-header2")) {

            String text = th.text();
            switch (thCount) {
              case 2:
                if (containsPrintableChars(text)) rec.setAbsent(Integer.parseInt(text));
                break;
              case 3:
                if (containsPrintableChars(text)) rec.setReleased(Integer.parseInt(text));
                break;
              case 4:
                if (containsPrintableChars(text)) rec.setSick(Integer.parseInt(text));
                break;
              case 5:
                // The page uses a decimal comma; Float.parseFloat needs a dot.
                if (containsPrintableChars(text))
                  rec.setAverage(Float.parseFloat(text.replace(',', '.')));
                break;
            }

            thCount++;
          }
        }

        // Looked up once and reused for the marks loop below; the DOM is not modified in between.
        Elements tds = tr.getElementsByTag("td");

        // last() yields null for a row without any td (for example a header-only row).
        Element total = tds.last();
        if (total != null) {
          String totalText = total.text();
          // Only plain integers that fit an int are accepted: the value goes through
          // Integer.parseInt, which rejects decimals such as "4.5".
          if (containsPrintableChars(totalText) && totalText.matches("[-+]?\\d{1,9}")) {
            rec.setTotal(Integer.parseInt(totalText));
          }
        }

        rec.setStart(s.getStart());
        rec.setStop(s.getStop());

        if (containsPrintableChars(rec.getFormText())) {

          GradeRec exR = sch.getGradeRecByDateText(rec.getStart(), rec.getFormText());
          if (exR != null) {

            exR.setAbsent(rec.getAbsent());
            exR.setAverage(rec.getAverage());
            exR.setReleased(rec.getReleased());
            exR.setSick(rec.getSick());
            exR.setTotal(rec.getTotal());

            // make sure we have only fresh marks
            exR.deleteMarks();

            exR.update();
            // From here on marks are attached to the already stored record.
            rec = exR;

          } else {

            sch.addGradeRec(rec);
          }

          for (Element td : tds) {

            if (td.hasAttr("class") && td.attr("class").equals("grade-with-type")) {

              Element span = td.getElementsByTag("span").first();
              if (span == null) {
                // A grade cell without a span carries no mark to read.
                continue;
              }

              String marks = span.text();
              String title = span.attr("title");

              if (containsPrintableChars(marks) && containsPrintableChars(title)) {

                // The span title is used as the key (comment) of the mark record.
                MarkRec mr = rec.getMarkRecByComment(title);
                if (mr != null) {

                  mr.setMarks(marks);
                  mr.update();

                } else {

                  mr = new MarkRec(marks, title);

                  mNewMarks.add(mr);
                  rec.addMarcRec(mr);
                }
              }
            }
          }
        }
      }
    }
  }
Ejemplo n.º 25
0
  /**
   * Reads the timetable cells of a page and inserts or updates the matching lessons in the given
   * schedule.
   *
   * <p>Every element carrying a {@code number} attribute is treated as one lesson slot. Its time
   * range is the text of the last {@code "cell-header2"} descendant that has a {@code style}
   * attribute. The start is the text up to one character before the first {@code '-'} and the
   * stop is the text from two characters after it, so a single character is expected on each
   * side of the hyphen. Dates come from the {@code jsdate} attribute of nested elements and are
   * parsed as {@code dd.MM.yyyy HH:mm} in the {@code Europe/Moscow} time zone.
   *
   * <p>For each non-empty {@code "lesson-subject"} element the lesson is looked up by start time
   * and number. Unknown lessons are created and added; known lessons are updated. When two
   * consecutive subjects resolve to the same start and number, their form text and teacher are
   * combined through {@code fixDuplicateString}.
   *
   * @param doc the parsed timetable page
   * @param s the schedule that is queried for existing lessons and receives new ones
   * @throws ParseException if a scheduled lesson has a missing or malformed time range, or a date
   *     that does not match the expected format
   * @throws NumberFormatException if a {@code number} attribute is not an integer
   */
  public static void getLessons(Document doc, Schedule s) throws ParseException {

    final SimpleDateFormat format = new SimpleDateFormat("dd.MM.yyyy HH:mm", Locale.ENGLISH);
    format.setTimeZone(TimeZone.getTimeZone("Europe/Moscow"));

    Elements lessonCells = doc.getElementsByAttribute("number");

    for (Element lessonCell : lessonCells) {

      Lesson l, lPrev = null; // lPrev to handle duplicate lesson
      int sameLesson = 0; // Also to handle duplicate lesson

      int number = Integer.parseInt(lessonCell.attr("number"));
      String time = "";

      Elements timeDetails = lessonCell.getElementsByClass("cell-header2");
      for (Element timeDetail : timeDetails) {
        if (timeDetail.hasAttr("style")) time = timeDetail.text();
      }

      // Position of the hyphen separating start and stop; -1 when the cell has no time range.
      final int dash = time.indexOf("-");

      Elements lessonCellDetails = lessonCell.getElementsByAttribute("jsdate");
      for (Element lessonCellDetail : lessonCellDetails) {

        String date = lessonCellDetail.attr("jsdate");
        int index = 0;
        sameLesson = 0;

        // Looked up once per day cell; nothing in the loop below modifies the document.
        Elements teachers =
            lessonCellDetail.getElementsByAttributeValue("class", "lesson-teacher");

        for (Element subject :
            lessonCellDetail.getElementsByAttributeValue("class", "lesson-subject")) {

          if (subject.text().isEmpty()) {
            // No lesson scheduled
            continue;
          }

          // Checked only once a lesson really needs the time, so slots without lessons and
          // without a time header are still skipped silently.
          if (dash < 1) {
            throw new ParseException(
                "Lesson " + number + " has no usable time range: \"" + time + "\"", 0);
          }

          Date start = format.parse(date + " " + time.substring(0, dash - 1));
          if ((l = s.getLessonByNumber(start, number)) == null) {

            if (BuildConfig.DEBUG)
              Log.d("GshisHTMLParser", TS.get() + " getLessons() not found in db, will insert");

            l = new Lesson();
            sameLesson = 0;

            l.setStart(start);

            if (dash + 2 > time.length()) {
              throw new ParseException(
                  "Lesson " + number + " has no stop time: \"" + time + "\"", dash);
            }
            l.setStop(format.parse(date + " " + time.substring(dash + 2)));
            l.setFormId(subject.attr("id"));
            l.setFormText(subject.text());
            // sameLesson was reset just above, so this reads the first teacher entry.
            l.setTeacher(teachers.get(sameLesson).text());
            l.setNumber(number);

            s.addLesson(l);

          } else {

            if (BuildConfig.DEBUG)
              Log.d("GshisHTMLParser", TS.get() + " getLessons() found in db, will update");

            l.setFormId(subject.attr("id"));

            if (lPrev != null && lPrev.getStart().equals(start) && lPrev.getNumber() == number) {

              if (BuildConfig.DEBUG)
                Log.d(
                    "GshisHTMLParser",
                    TS.get()
                        + " getLessons() dup = "
                        + subject.text()
                        + " index = "
                        + index
                        + " sameLesson = "
                        + sameLesson);

              sameLesson++;

              if (!lPrev.getFormText().equals(subject.text()))
                l.setFormText(fixDuplicateString(subject.text(), lPrev.getFormText(), sameLesson));

              String teacher = teachers.get(index).text();

              if (!lPrev.getTeacher().equals(teacher))
                l.setTeacher(fixDuplicateString(teacher, lPrev.getTeacher(), sameLesson));

            } else {

              l.setNumber(number);
              l.setFormText(subject.text());
              l.setTeacher(teachers.get(index).text());
            }

            l.update();
          }

          lPrev = l;
          // index only advances for subjects that were processed; empty ones are skipped above.
          index++;
        }
      }
    }
  }
 /**
  * Extends {@code sub} with the children of {@code sup}.
  *
  * <p>A cloned list of {@code sup}'s children is filtered against each child of {@code sub}:
  *
  * <ul>
  *   <li>a child carrying {@code Language.OVERRIDE_ATTRIBUTE} removes the first inherited element
  *       with an equal {@code Language.IDENTIFICATION_ATTRIBUTE} value;
  *   <li>a child for which {@code Language.isOverridden} is true removes the first inherited
  *       element with the same tag name;
  *   <li>a {@code meta} child with a {@code name} (or else {@code http-equiv}) attribute removes
  *       the inherited {@code meta} with the same attribute value, if exactly one matches;
  *   <li>any other child whose identification attribute equals that of an inherited element is
  *       replaced by the recursive extension of the two, and the inherited element is removed.
  * </ul>
  *
  * <p>Whatever remains of the inherited list is then prepended to {@code sub}, keeping its
  * original order.
  *
  * @param sup the element being extended from (the "super" element); only clones of its
  *     children are used
  * @param sub the extending element; it is modified in place
  * @return {@code sub}, after the inherited children have been merged in
  */
 private Element extend(Element sup, Element sub) {
   // Get the child elements for both the sup (super) element and the sub
   // (extended) element.
   Elements subElements = sub.children();
   Elements supElements = sup.children().clone();
   // For each element in the sub group,
   loop:
   for (Element e : subElements) {
     // If it's overridden, delete it from sup.
     if (e.hasAttr(Language.OVERRIDE_ATTRIBUTE)) {
       for (Element el : supElements) {
         if (el.attr(Language.IDENTIFICATION_ATTRIBUTE)
             .equals(e.attr(Language.IDENTIFICATION_ATTRIBUTE))) {
           // Leave the inner loop right away so the iterator is not used after the removal.
           supElements.remove(el);
           continue loop;
         }
       }
       // Fail silently if no element is found to override.
       continue loop;
     } else if (Language.isOverridden(e)) {
       // Some elements are automatically overridden if they exist.
       for (Element el : supElements) {
         if (el.tagName().equals(e.tagName())) {
           supElements.remove(el);
           continue loop;
         }
       }
       // Fail silently if no element is found to override.
       continue loop;
     } else if (e.tagName().equals("meta")) { // If this is a meta tag,
       if (e.hasAttr("name")) { // If it's got a name,
         // Find and override the meta tag in supElements with that name.
         removeSingleInheritedMeta(supElements, "name", e.attr("name"));
       } else if (e.hasAttr("http-equiv")) { // If it's got a http-equiv,
         // Find and override the meta tag in supElements with that http-equiv.
         removeSingleInheritedMeta(supElements, "http-equiv", e.attr("http-equiv"));
       }
     } else {
       // If it's not overridden but does correspond to an element,
       // recursively extend it.
       for (Element el : supElements) {
         if (el.hasAttr(Language.IDENTIFICATION_ATTRIBUTE)
             && el.attr(Language.IDENTIFICATION_ATTRIBUTE)
                 .equals(e.attr(Language.IDENTIFICATION_ATTRIBUTE))) {
           Element temp = extend(el.clone(), e.clone()).clone();
           e.replaceWith(temp);
           supElements.remove(el);
           continue loop;
         }
       }
     }
   }
   // Add the elements from the sup to the beginning of sub. This is where
   // the real extension happens. Reversing first keeps the inherited order,
   // because each element is prepended in front of the previous one.
   Collections.reverse(supElements);
   for (Element e : supElements) {
     sub.prependChild(e.clone());
   }
   return sub;
 }

 /**
  * Removes the inherited {@code meta} element whose {@code attribute} equals {@code value}, but
  * only when exactly one element matches and it is a direct member of {@code supElements}.
  *
  * <p>{@code select} also returns descendants of the listed elements; such a match is not in the
  * list itself, so {@code indexOf} yields -1 and nothing is removed.
  *
  * @param supElements the inherited elements to filter; modified in place
  * @param attribute the meta attribute to match on ({@code name} or {@code http-equiv})
  * @param value the attribute value that identifies the overridden meta tag
  */
 private void removeSingleInheritedMeta(Elements supElements, String attribute, String value) {
   Elements metaThatMatch = supElements.select("meta[" + attribute + "=\"" + value + "\"]");
   if (metaThatMatch.size() == 1) {
     int position = supElements.indexOf(metaThatMatch.first());
     if (position >= 0) {
       supElements.remove(position);
     }
   }
 }
 /**
  * Tests whether {@code element1} has the attribute {@code key} with a value that, once
  * lower-cased, ends with {@code value}.
  *
  * @param element not used by this check
  * @param element1 the element whose attribute is inspected
  * @return {@code true} if the attribute is present and its lower-cased value ends with
  *     {@code value}; {@code false} otherwise
  */
 public boolean matches(Element element, Element element1) {
   if (!element1.hasAttr(key)) {
     return false;
   }
   String lowered = element1.attr(key).toLowerCase();
   return lowered.endsWith(value);
 }