예제 #1
0
  /**
   * Builds a CAS for one comment element and runs the configured analysis engines over it.
   *
   * <p>The document text is the normalized aggregation of the text of the first {@code CSubject}
   * and the first {@code CBody} descendant of {@code comment}; the document language is set to
   * {@code "en"}.
   *
   * @param comment element containing at least one {@code CSubject} and one {@code CBody} element
   * @return the CAS after {@code analysisEngineList} has been run on it
   * @throws UIMAException if the CAS cannot be created or the pipeline fails
   */
  private JCas computeCommentCas(Element comment) throws UIMAException {
    JCas cCas = JCasFactory.createJCas();

    String csubject = comment.getElementsByTag("CSubject").get(0).text();
    String cbody = comment.getElementsByTag("CBody").get(0).text();
    String commentText =
        TextNormalizer.normalize(SubjectBodyAggregator.getCommentText(csubject, cbody));

    // Set up the comment CAS.
    cCas.reset();
    cCas.setDocumentLanguage("en");
    cCas.setDocumentText(commentText);

    // Run the UIMA pipeline.
    SimplePipeline.runPipeline(cCas, this.analysisEngineList);
    return cCas;
  }
예제 #2
0
  /**
   * Determines the horizontal extent of the column by scanning every cell.
   *
   * <p>For each cell, the {@code title} attribute of its first and last word is split on
   * whitespace; token 1 of the first word is read as an x1 candidate and token 3 of the last word
   * as an x2 candidate. The smallest x1 and the largest x2 seen are stored in the
   * {@code columnBoundaryX1} and {@code columnBoundaryX2} fields. When {@code cells} is empty the
   * fields are left untouched.
   */
  private void findColumnBoundaries() {
    int smallestX1 = Integer.MAX_VALUE;
    int largestX2 = Integer.MIN_VALUE;

    for (ArrayList<Element> cell : cells) {
      Element leadingWord = cell.get(0);
      Element trailingWord = cell.get(cell.size() - 1);

      // Left edge: second token of the first word's title.
      int x1 = Integer.parseInt(leadingWord.attr("title").split("\\s+")[1]);
      if (x1 < smallestX1) {
        smallestX1 = x1;
        this.columnBoundaryX1 = smallestX1;
      }

      // Right edge: fourth token of the last word's title.
      int x2 = Integer.parseInt(trailingWord.attr("title").split("\\s+")[3]);
      if (x2 > largestX2) {
        largestX2 = x2;
        this.columnBoundaryX2 = largestX2;
      }
    }
  }
예제 #3
0
  /**
   * Builds an item from a {@code div#feed_item} HTML element.
   *
   * <p>Expected layout (by child position): {@code root > div#item_inner > div#item_img > a > img}
   * for the image, and {@code root > div#item_inner > div#item_info > h3 > a} for the title.
   *
   * @param root the feed item element to read from
   */
  public MeipinItem(Element root) {
    id = root.attr("id");

    Element itemInner = root.child(0); // div#item_inner
    Element imageLink = itemInner.child(0).child(0); // div#item_img > a

    this.thumbnailUri = imageLink.child(0).attr("src"); // img

    if (!imageLink.attributes().hasKey("onmouseover")) {
      // No hover handler: the thumbnail is also the full image.
      this.uri = this.thumbnailUri;
    } else {
      // The URI sits between ", '" and the next single quote of the hover handler.
      String handler = imageLink.attr("onmouseover");
      int from = handler.indexOf(",") + 2;
      int to = handler.indexOf("'", from);
      this.uri = handler.substring(from, to);
      this.isGif = true;
    }

    if (this.uri.contains(".gif")) {
      this.isGif = true;
    }

    Element itemInfo = root.child(0).child(1); // div#item_inner > div#item_info
    Element titleLink = itemInfo.child(1).child(0); // h3 > a
    this.title = titleLink.attr("title");
  }
예제 #4
0
  /**
   * Parses the table rows into program data; by default the first column is parsed.
   *
   * <p>Two countdown counters track how many more rows the current {@code rowspan} of the leading
   * cell ({@code rowspan_0}) and of the program cell ({@code rowspan_1}) still cover. A program
   * entry is written only for rows where a new program cell starts; other rows keep {@code null}
   * entries. Any exception raised while handling a row is printed to {@code System.out} and the
   * row is skipped, in which case neither counter is decremented for that row.
   *
   * @param rows the source row set
   * @return one {@code [dt text, dd text]} pair per input row (entries may be {@code null})
   */
  private static String[][] parseRows(Elements rows) {
    String[][] programs = new String[rows.size()][2];
    // Remaining rows covered by the current leading cell / current program cell.
    int rowspan_0 = 0;
    int rowspan_1 = 0;
    for (int i = 0; i < rows.size(); i++) {
      Element row = rows.get(i);
      try {
        Elements cells = row.children();

        if (rowspan_0 == 0) {
          // A new leading cell starts in this row; the program cell, if any, is the second child.
          Element cell_0 = cells.get(0);
          // NOTE(review): presumably throws NumberFormatException when "rowspan" is absent or
          // non-numeric, which lands in the catch below - confirm against real input.
          rowspan_0 = Integer.valueOf(cell_0.attr("rowspan"));
          if (rowspan_1 == 0) {
            Element cell_1 = cells.get(1);
            rowspan_1 = Integer.valueOf(cell_1.attr("rowspan"));
            programs[i][0] = DBclass.xmlFilte(cell_1.select("dt").text());
            programs[i][1] = DBclass.xmlFilte(cell_1.select("dd").text());
          }
        } else if (rowspan_1 == 0) {
          // Still inside the leading cell's span, so the program cell is the first child here.
          Element cell_0 = cells.get(0);
          rowspan_1 = Integer.valueOf(cell_0.attr("rowspan"));
          programs[i][0] = DBclass.xmlFilte(cell_0.select("dt").text());
          programs[i][1] = DBclass.xmlFilte(cell_0.select("dd").text());
        }
        // One more row consumed from both spans.
        rowspan_0--;
        rowspan_1--;
      } catch (Exception e) {
        // Best effort: report the problem and continue with the next row.
        e.printStackTrace(System.out);
      }
    }
    return programs;
  }
예제 #5
0
  /**
   * Collects the {@code .jpg} images shown in the content area of the album page.
   *
   * <p>The image list and the document are reset through their setters first. For every
   * {@code img[src]} inside the first {@code [data-role=content]} element the {@code src} is used
   * when it ends with {@code .jpg}; otherwise {@code data-lazyload-src} is tried. Only URLs ending
   * with {@code .jpg} are added.
   *
   * @param album the album whose page is loaded
   * @return the image list held by this parser (empty when no document could be loaded)
   */
  @Override
  protected List<Image> getImagesToAlbum(Album album) {
    setImagesList(new ArrayList<Image>());
    setDocument(ParserManagement.getDocument(album.getUrl()));

    if (!notNull(getDocument())) {
      return getImagesList();
    }

    Elements candidates =
        getDocument().select("[data-role=content]").get(0).select("img[src]");
    if (!notNull(candidates)) {
      return getImagesList();
    }

    for (Element candidate : candidates) {
      String src = candidate.attr("src");
      String imageUrl = src.endsWith(".jpg") ? src : candidate.attr("data-lazyload-src");

      String width = candidate.attr("width");
      String height = candidate.attr("height");
      String alt = candidate.attr("alt");

      if (!notNullAndNotIsEmpty(imageUrl) || !imageUrl.endsWith(".jpg")) {
        continue;
      }
      getImagesList()
          .add(new Image(imageUrl, getAuthor(), width, height, alt, getPortfolio(), album));
    }

    return getImagesList();
  }
예제 #6
0
  /**
   * Rewrites lazily loaded images in the given HTML so they display directly.
   *
   * <p>For every {@code img}: the {@code data-s}, {@code data-src} and {@code data-w} attributes
   * are removed and {@code max-width} is set to {@code 640}. When {@code data-src} was present,
   * {@code src} becomes that URL up to and including its last {@code '/'} followed by
   * {@code 640}; images without {@code data-src} keep their existing {@code src}. In addition,
   * for every {@code script} whose body matches {@code var cover = "..."}, an {@code img} for
   * that cover URL is inserted before the first {@code div}.
   *
   * @param pcont the HTML content, may be {@code null}
   * @return the rewritten document HTML, or an empty string when {@code pcont} is {@code null}
   */
  public String reviseImgForWX(String pcont) {
    if (pcont == null) {
      return "";
    }

    Document doc = Jsoup.parse(pcont);

    for (Element img : doc.select("img")) {
      String source = img.attr("data-src");
      img.removeAttr("data-s");
      img.removeAttr("data-src");
      img.removeAttr("data-w");
      if (!source.isEmpty()) {
        // Without this guard a missing data-src would replace a valid src with the bare "640".
        String prefix = source.substring(0, source.lastIndexOf("/") + 1);
        img.attr("src", prefix + "640");
      }
      img.attr("max-width", "640");
    }

    Elements elesrp = doc.select("script");
    Elements divs = doc.select("div");
    if (elesrp.size() > 0 && divs.size() > 0) {
      // Compile once: the pattern does not depend on the script being scanned.
      Pattern coverPattern = Pattern.compile("(?<=(var\\scover\\s=\\s\"))\\S+(?=\")");
      for (Element ele : elesrp) {
        Matcher m = coverPattern.matcher(ele.html());
        if (m.find()) {
          String nimg = "<img src=\"" + m.group() + "\"/>";
          divs.get(0).before(nimg);
        }
      }
    }
    return doc.html();
  }
  /**
   * Initializes this entry from a result table row.
   *
   * <p>The show name is derived from the {@code data-sc-params} attribute found in the first
   * {@code td}; seeds and peers are parsed from the fifth and sixth {@code td}; the download link
   * comes from the "Download torrent file" anchor in the first {@code td}, with the query string
   * dropped, {@code .torrent} replaced by {@code /temp.torrent} and a protocol-relative URL
   * prefixed with {@code http:}.
   *
   * @param source the element containing the row's {@code td} cells
   */
  @Override
  protected void initialize(Element source) {
    Elements cells = source.getElementsByTag("td");
    Element firstCell = cells.get(0);

    // Strip the JSON-like wrapper around the name and undo the URL encoding of spaces.
    String name = firstCell.select("[data-sc-params]").get(0).attr("data-sc-params");
    name = name.replaceAll("\\{ 'name': '", "");
    name = name.replaceAll("', 'magnet':.*", "");
    name = name.replaceAll("%20", "\\.");
    name = name.replaceAll("%5B.*", "");

    initialize(ShowData.fromFilename(name));

    seeds = Integer.parseInt(cells.get(4).text());
    peers = Integer.parseInt(cells.get(5).text());

    Element torrentAnchor = cells.get(0).select("div a[title=Download torrent file]").get(0);
    String hrefWithoutQuery = torrentAnchor.attr("href").split("\\?")[0];
    downloadLink = hrefWithoutQuery.replaceAll("\\.torrent", "/temp\\.torrent");

    boolean protocolRelative = downloadLink.startsWith("//");
    if (protocolRelative) {
      downloadLink = "http:" + downloadLink;
    }
  }
예제 #8
0
파일: Tree.java 프로젝트: morenos/vaadin
  /**
   * Recursively writes a data source item and its children to a design.
   *
   * <p>A {@code node} element is appended to {@code design} with the item id's string form as
   * its {@code text} attribute, an {@code icon} attribute when the item has an icon, and an empty
   * {@code selected} attribute when the item is selected. Each child item is then written into
   * that new element.
   *
   * @since 7.5.0
   * @param design the element into which to insert the item
   * @param itemId the id of the item to write
   * @param context the DesignContext instance used in writing
   * @return the {@code node} element created for {@code itemId}
   */
  @Override
  protected Element writeItem(Element design, Object itemId, DesignContext context) {
    Element node = design.appendElement("node");
    node.attr("text", itemId.toString());

    Resource itemIcon = getItemIcon(itemId);
    if (itemIcon != null) {
      DesignAttributeHandler.writeAttribute(
          "icon", node.attributes(), itemIcon, null, Resource.class);
    }

    if (isSelected(itemId)) {
      node.attr("selected", "");
    }

    // The children collection is null-checked before iterating (see #5864).
    Collection<?> childIds = getChildren(itemId);
    if (childIds == null) {
      return node;
    }
    for (Object childId : childIds) {
      writeItem(node, childId, context);
    }
    return node;
  }
예제 #9
0
  /**
   * Replaces {@code cid:} references in {@code src} and {@code href} attributes with the URL of
   * the matching attachment.
   *
   * <p>References whose content id has no attachment in the map (or maps to {@code null}) are
   * left unchanged.
   *
   * @param html the HTML to rewrite
   * @param attachments attachments keyed by content id (the part after {@code cid:})
   * @return the inner HTML of the first {@code body} element, or the whole document HTML when
   *     there is no body
   */
  private static String replaceCidWithAttachments(
      String html, Map<String, Attachment> attachments) {
    final String cidScheme = "cid:";
    Document doc = Jsoup.parse(html);
    String[] attrNames = {"src", "href"};

    for (String attrName : attrNames) {
      for (Element tag : doc.select("*[" + attrName + "]")) {
        String uriString = tag.attr(attrName).trim();

        // Locale.ROOT keeps the scheme check stable under locales with special casing rules
        // (e.g. Turkish, where "CID:" would otherwise lower-case to a dotless i).
        if (!uriString.toLowerCase(java.util.Locale.ROOT).startsWith(cidScheme)) {
          continue;
        }

        // Single lookup; a missing or null entry simply leaves the reference as it is.
        Attachment attachment = attachments.get(uriString.substring(cidScheme.length()));
        if (attachment == null) {
          continue;
        }

        Long id = attachment.id;
        tag.attr(attrName, controllers.routes.AttachmentApp.getFile(id).url());
      }
    }

    Elements bodies = doc.getElementsByTag("body");
    if (bodies.size() > 0) {
      return bodies.get(0).html();
    }
    return doc.html();
  }
  /**
   * Extracts a comma-separated list from an attribute of the first element matching a CSS
   * selector.
   *
   * <p>{@code extractor_args} must hold the CSS selector followed by one or more attribute names.
   * The attribute names are tried in order; the first one whose value is not blank is split on
   * commas, each part is trimmed and lower-cased, and the parts are re-joined with {@code ','}.
   *
   * @param attributeDetail supplies {@code extractor_args}
   * @param url not used by this extractor
   * @param articleDoc the document to search
   * @return the normalized list, or {@code null} when the arguments are missing, the selector is
   *     blank, nothing matches, or every candidate attribute is blank
   * @throws Exception declared by the overridden method
   */
  @Override
  public String extract(AttributeDetail attributeDetail, String url, Document articleDoc)
      throws Exception {
    if (attributeDetail.extractor_args == null || attributeDetail.extractor_args.size() < 2) {
      return null;
    }

    // Validate the selector BEFORE handing it to select(); it used to be checked afterwards.
    String cssSelector = attributeDetail.extractor_args.get(0);
    if (!StringUtils.isNotBlank(cssSelector)) {
      return null;
    }

    Element element = articleDoc.select(cssSelector).first();
    if (element == null) {
      return null;
    }

    // Index 0 is the selector; the remaining arguments are candidate attribute names.
    for (int argIndex = 1; argIndex < attributeDetail.extractor_args.size(); argIndex++) {
      String rawList = element.attr(attributeDetail.extractor_args.get(argIndex));
      if (!StringUtils.isNotBlank(rawList)) {
        continue;
      }
      String[] parts = rawList.split(",");
      for (int i = 0; i < parts.length; i++) {
        parts[i] = parts[i].trim().toLowerCase();
      }
      return StringUtils.join(parts, ',');
    }

    return null;
  }
예제 #11
0
 /**
  * Fetches the login page, copies every input of {@code loginForm} and posts the form back with
  * the credentials filled in.
  *
  * <p>When {@code captchaData} is non-empty its first three entries are sent as
  * {@code hip_solution}, {@code hip_token} and {@code fid}; otherwise all captcha fields are
  * removed from the submitted data.
  *
  * @param username the account name
  * @param password the account password
  * @param captchaData captcha solution, token and fid, or an empty array
  * @return the response to the login POST
  * @throws ConnectionException if an {@link IOException} occurs while talking to the server
  */
 private Response postToLogin(String username, String password, String[] captchaData)
     throws ConnectionException {
   try {
     Map<String, String> formFields = new HashMap<>();
     Element loginForm =
         Jsoup.connect(Endpoints.LOGIN_URL.url()).get().getElementById("loginForm");
     for (Element field : loginForm.getElementsByTag("input")) {
       formFields.put(field.attr("name"), field.attr("value"));
     }

     Date submittedAt = new Date();
     String timezone = new SimpleDateFormat("XXX").format(submittedAt).replace(':', '|');
     formFields.put("timezone_field", timezone);
     formFields.put("username", username);
     formFields.put("password", password);
     formFields.put("js_time", String.valueOf(submittedAt.getTime() / 1000));

     if (captchaData.length == 0) {
       // No captcha was solved: make sure none of the captcha fields is submitted.
       String[] captchaKeys = {"hip_solution", "hip_token", "fid", "hip_type", "captcha_provider"};
       for (String captchaKey : captchaKeys) {
         formFields.remove(captchaKey);
       }
     } else {
       formFields.put("hip_solution", captchaData[0]);
       formFields.put("hip_token", captchaData[1]);
       formFields.put("fid", captchaData[2]);
       formFields.put("hip_type", "visual");
       formFields.put("captcha_provider", "Hip");
     }

     return Jsoup.connect(Endpoints.LOGIN_URL.url())
         .data(formFields)
         .method(Method.POST)
         .execute();
   } catch (IOException e) {
     throw ExceptionHandler.generateException("While submitting credentials", e);
   }
 }
예제 #12
0
 /**
  * Looks up the course type shown by the Unistats widget referenced in the document.
  *
  * <p>The widget URL is built from the {@code data-*} attributes of the {@code #kw} element. The
  * type is the first word of the widget's course heading, or the whole heading when it contains
  * no space after its first character.
  *
  * @param doc the page that may embed the widget
  * @return the course type, or an empty string when there is no {@code #kw} element, the widget
  *     has no course heading, or fetching the widget fails (the failure is printed to stderr)
  */
 public static String getType(Document doc) {
   String type = "";
   Element widget = doc.select("#kw").first();
   if (widget == null) {
     return type;
   }

   String typeUrl =
       "http://widget.unistats.ac.uk/Widget/"
           + widget.attr("data-institution") + "/"
           + widget.attr("data-course") + "/"
           + widget.attr("data-orientation") + "/"
           + "null/"
           + widget.attr("data-language") + "/"
           + widget.attr("data-kismode");

   // A single attempt: the former do/while always finished after its first iteration.
   try {
     Document widgetDoc = Jsoup.connect(typeUrl).timeout(10000).get();
     Element heading = widgetDoc.select("#kisWidget > div.widgetCourse > h1").first();
     if (heading != null) {
       String headingText = heading.text().trim();
       int firstSpace = headingText.indexOf(" ");
       type = firstSpace > 0 ? headingText.substring(0, firstSpace) : headingText;
     }
   } catch (Exception ex) {
     ex.printStackTrace();
   }
   return type;
 }
예제 #13
0
 /**
  * Captures the input fields of the form selected by {@code formSelector}.
  *
  * <p>Every {@code input} of the first matching form except those of type {@code submit} is
  * appended to {@code hiddenFormFields} as a name/value pair. When no form matches, an error is
  * logged and the JVM is terminated via {@code System.exit(1)}.
  *
  * <p>NOTE(review): every path returns {@code false}, including the successful one - confirm
  * whether callers rely on the return value.
  *
  * @param rw wrapper providing the parsed page
  * @param hiddenFormFields receives the captured fields
  * @param formSelector CSS selector locating the form
  * @return always {@code false}
  */
 public static boolean getFormFields(
     ResponseWrapper rw, List<NameValuePairString> hiddenFormFields, String formSelector) {
   // --- analysis of the page containing the form, specific to the site
   Elements matches = rw.getJSoupDocument().select(formSelector); // for debugging: should be one
   if (matches == null || matches.size() <= 0) {
     log.error("unable to find form at selector: " + formSelector);
     System.exit(1);
     return false;
   }

   Element loginForm = matches.get(0);
   if (loginForm == null) {
     log.error("failed to get form to analyze at: " + rw.dump());
     System.exit(1);
   }

   // Capture every input except submit buttons.
   for (Element input : loginForm.select("input")) {
     if (!input.attr("type").equals("submit")) {
       String attrName = input.attr("name");
       hiddenFormFields.add(new NameValuePairString(attrName, input.val()));
       log.debug("captured form input: " + attrName + " = " + input.val());
     }
   }
   return false;
 }
예제 #14
0
파일: Crawler.java 프로젝트: KevalS/Crawler
  /**
   * Crawls the given page once: records its URL, reports it when the page text contains "PhD",
   * and recursively processes every linked http(s) page whose {@code href} contains
   * {@code mit.edu}.
   *
   * <p>A URL already present in the {@code Record} table is skipped, which is what stops the
   * recursion from revisiting pages.
   *
   * @param URL the absolute URL of the page to process
   * @throws SQLException if the lookup or the insert fails
   * @throws IOException if the page cannot be fetched
   */
  public static void processPage(String URL) throws SQLException, IOException {
    // Check whether the URL is already in the database. The URL is bound as a parameter rather
    // than concatenated into the statement, since it comes straight from crawled pages.
    try (PreparedStatement lookup =
        db.conn.prepareStatement("select * from Record where URL = ?")) {
      lookup.setString(1, URL);
      try (ResultSet rs = lookup.executeQuery()) {
        if (rs.next()) {
          return;
        }
      }
    }

    // Store the URL to avoid parsing it again.
    String sql = "INSERT INTO  test.Record " + "(URL) VALUES " + "(?);";
    try (PreparedStatement stmt =
        db.conn.prepareStatement(sql, Statement.RETURN_GENERATED_KEYS)) {
      stmt.setString(1, URL);
      stmt.execute();
    }

    // Fetch the page that was actually requested (previously a fixed page was fetched every time).
    Document doc = Jsoup.connect(URL).get();

    if (doc.text().contains("PhD")) {
      System.out.println(URL);
    }

    // Follow all matching links recursively.
    Elements questions = doc.select("a[href]");
    for (Element link : questions) {
      if (!link.attr("href").contains("mit.edu")) {
        continue;
      }
      String target = link.attr("abs:href");
      // Only http(s) targets can be fetched; this skips mailto:, javascript: and unresolved links.
      if (target.startsWith("http://") || target.startsWith("https://")) {
        processPage(target);
      }
    }
  }
예제 #15
0
 /**
  * Imports bookmarks from an uploaded HTML bookmark file for the logged-in user.
  *
  * <p>Every {@code a} element carrying an {@code add_date} attribute becomes a bookmark whose
  * update date is that attribute (read as seconds, converted to milliseconds). A bookmark that
  * fails to save is skipped. Nothing is imported when the login user has no id.
  *
  * @param compParameter request context; supplies the login user and the character encoding
  * @param multipartFile the uploaded bookmark file
  * @param json not used by this implementation
  */
 @Override
 public void upload(
     ComponentParameter compParameter,
     IMultipartFile multipartFile,
     HashMap<String, Object> json) {
   try {
     ID id = ItSiteUtil.getLoginUser(compParameter).getId();
     if (id == null) {
       return;
     }

     final Document document =
         Jsoup.parse(
             multipartFile.getInputStream(), compParameter.request.getCharacterEncoding(), "");
     for (final Element anchor : document.getElementsByTag("a")) {
       if (!anchor.hasAttr("add_date")) {
         continue;
       }
       final BookmarkBean bean = new BookmarkBean();
       final long addedAtMillis = ConvertUtils.toLong(anchor.attr("add_date"), 0) * 1000;
       bean.setTitle(anchor.text());
       bean.setUrl(anchor.attr("href"));
       bean.setUserId(id);
       bean.setUpdateDate(new Date(addedAtMillis));
       try {
         BookmarkUtils.applicationModule.doUpdate(bean);
       } catch (Exception ignored) {
         // Best effort: a bookmark that cannot be stored must not abort the import.
       }
     }
   } catch (final Exception e) {
     throw DataObjectException.wrapException("没有权限");
   }
 }
예제 #16
0
  /**
   * Converts one HTML table row into a bank account.
   *
   * <p>Only rows whose {@code class} is {@code bg0} (case-insensitive) are considered, and only
   * for the account type {@code "Current Accounts"}; every other combination yields
   * {@code null}. Column use: 0 = name (first child element), 1 = currency, 2 = server id and
   * IBAN, 3 = balance, 4 = available balance.
   *
   * @param type the account section the row belongs to
   * @param row the table row element
   * @return the parsed account, or {@code null} when the row or type is not handled
   */
  private RawBankAccount obtainBankAccountFromHtmlTableRow(String type, Element row) {
    // Anything that is not a "bg0" data row (detail rows included) carries no account.
    if (!"bg0".equalsIgnoreCase(row.attr("class"))) {
      return null;
    }

    Log.v(TAG, "working row(" + type + "): " + row.html());

    // Cards are skipped for now and unknown types are ignored.
    if (!"Current Accounts".equalsIgnoreCase(type)) {
      return null;
    }

    return new RawBankAccount()
        .setServerId(row.child(2).text())
        .setName(row.child(0).child(0).text())
        .setIBAN(row.child(2).text())
        .setCurrency(row.child(1).text())
        .setBalance(Convert.strToFloat(row.child(3).text()))
        .setAvailableBalance(Convert.strToFloat(row.child(4).text()));
  }
예제 #17
0
 /**
  * Downloads a page and tries to discover its feed through a {@code link} element with
  * {@code rel="alternate"} and {@code type="application/rss+xml"}.
  *
  * @param url The URL of the page to inspect.
  * @return The feed source found or created for the first matching link, or {@code null} when
  *     the page declares no such link.
  * @throws ClientProtocolException If the page could not be pulled.
  * @throws IOException If the page could not be pulled.
  * @throws DataOperationException If a query could not be executed.
  */
 public FeedSource discover(final String url)
     throws ClientProtocolException, IOException, DataOperationException {
   log.fine("Discovering feed for " + url);
   try (final CloseableHttpClient client = HttpClientBuilder.create().build();
       final CloseableHttpResponse response = client.execute(new HttpGet(url))) {
     final String html = EntityUtils.toString(response.getEntity());
     for (final Element link : Jsoup.parse(html).select("link")) {
       if (!"alternate".equals(link.attr("rel"))) {
         log.fine("Found link " + link.html());
         continue;
       }
       if (!"application/rss+xml".equals(link.attr("type"))) {
         log.fine("Found alternate link " + link.html());
         continue;
       }
       final String rss = link.attr("href");
       log.fine("Found rss link " + rss);
       return this.feedSourceManager.findOrCreateByFeedUrl(rss);
     }
   }
   return null;
 }
예제 #18
0
  /**
   * Finds the currently selected pupil on the page, registering unknown pupils on the way.
   *
   * <p>Every {@code option} below the element with id {@code ctl00_topMenu_pupil_drdPupils} is
   * looked up by its {@code value}; pupils not yet known are created and inserted. The option
   * with {@code selected="selected"} determines the result. When the page has no such options,
   * the pupil is taken from the {@code user-name} element and the
   * {@code ctl00_topMenu_tbUserId} field instead.
   *
   * @param doc the parsed page
   * @return the selected pupil
   * @throws ParseException if options exist but none of them is marked as selected
   */
  public static Pupil getSelectedPupil(Document doc) throws ParseException {
    boolean optionSeen = false;
    Pupil selected = null;

    for (Element selector :
        doc.getElementsByAttributeValue("id", "ctl00_topMenu_pupil_drdPupils")) {
      for (Element node : selector.getAllElements()) {
        if (!node.tagName().equals("option")) {
          continue;
        }
        optionSeen = true;

        String formId = node.attr("value");
        Pupil pupil = Pupil.getByFormId(formId);
        if (pupil == null) {
          pupil = new Pupil(node.text(), formId);
          long rowId = pupil.insert();
          if (BuildConfig.DEBUG) {
            Log.d("GshisHTMLParser", TS.get() + " Pupil.insert() = " + rowId);
          }
        }

        if (node.hasAttr("selected") && node.attr("selected").equals("selected")) {
          selected = pupil;
        }
      }
    }

    if (!optionSeen) {
      // No drop-down on this page: fall back to the single-user fields.
      if (BuildConfig.DEBUG) {
        Log.d("GshisParser", TS.get() + " Alternative fields found!");
      }

      Element userName = doc.getElementsByClass("user-name").first();
      Element userId = doc.getElementsByAttributeValue("id", "ctl00_topMenu_tbUserId").first();
      String name = userName.text();
      String id = userId.attr("value");

      if (BuildConfig.DEBUG) {
        Log.d("GshisParser", TS.get() + " name=" + name + " id=" + id);
      }

      Pupil pupil = Pupil.getByFormId(id);
      if (pupil == null) {
        pupil = new Pupil(name, id);
        long rowId = pupil.insert();
        if (BuildConfig.DEBUG) {
          Log.d("GshisParser", TS.get() + " Pupil.insert() = " + rowId);
        }
      }
      selected = pupil;
    }

    if (selected == null) {
      throw new ParseException("Pupils not found", 0);
    }
    return selected;
  }
예제 #19
0
  /**
   * Collects the links to follow from a result page: the product rows without an
   * {@code onclick} handler, plus the "next page" link unless it points to page 7.
   *
   * <p>Links are resolved to absolute URLs. Links that cannot be converted to a {@link URL} are
   * skipped and reported at debug level.
   *
   * @param document the result page
   * @return the URLs to visit next; never {@code null}
   */
  @Override
  public List<URL> getNextPages(Document document) {
    List<URL> urls = new ArrayList<>();

    // Collect rows with links to the offer comparison pages.
    Elements elements = document.select(PRODUCTS_ROW_QUERY + ":not([onclick])");
    for (Element element : elements) {
      String str = element.attr("abs:href");
      try {
        urls.add(Utils.stringToURL(str));
      } catch (ConnectionException e) {
        logger.debug("Skipping product link that is not a valid URL: " + str);
      }
    }

    // Pagination: stop following "next" once it would lead to page MAX_PAGE.
    final int MAX_PAGE = 7;
    Element next = document.select("a[href].next").first();
    if (next != null) {
      // Resolve against the base URI like the product links above, so a relative
      // "next" href is not lost when converting to a URL.
      String nextStr = next.attr("abs:href");
      if (!nextStr.contains("page_nr=" + MAX_PAGE)) {
        try {
          urls.add(Utils.stringToURL(nextStr));
        } catch (ConnectionException e) {
          logger.debug("Skipping next-page link that is not a valid URL: " + nextStr);
        }
      }
    }

    logger.debug("Collected " + urls.size() + " urls to visit");
    return urls;
  }
예제 #20
0
 /**
  * Reads an EPUB, rewrites every content document and writes the result to {@code dest}.
  *
  * <p>For each content resource a {@code content-type} meta element declaring UTF-8 is added to
  * the head, the body is passed through {@code alterElement} and {@code modify} (whose running
  * counts are carried from one resource to the next), and the resource data is replaced by the
  * resulting HTML encoded as UTF-8. Resources without a media type get {@code html}.
  *
  * @param bookPath path of the EPUB to read
  * @param dest path of the EPUB to write
  * @throws FileNotFoundException if {@code bookPath} or {@code dest} cannot be opened
  * @throws IOException if reading or writing the book fails
  */
 public static void processEpub(String bookPath, String dest)
     throws FileNotFoundException, IOException {
   Book b;
   // The input stream is closed as soon as the book has been read.
   try (FileInputStream in = new FileInputStream(new File(bookPath))) {
     b = new EpubReader().readEpub(in);
   }

   Count cnt = new Count(0, 0);
   for (Resource res : b.getContents()) {
     // Decode and encode explicitly as UTF-8 so the bytes match the charset declared below
     // regardless of the platform default.
     String content = new String(res.getData(), java.nio.charset.StandardCharsets.UTF_8);
     // NOTE(review): the second argument of Jsoup.parse(String, String) is the base URI, not a
     // charset - kept as in the original, confirm intent.
     Document doc = Jsoup.parse(content, "UTF-8");

     // <meta http-equiv="content-type" content="text/html; charset=utf-8"> belongs inside head.
     Element elem = new Element(Tag.valueOf("meta"), "");
     elem.attr("http-equiv", "content-type");
     elem.attr("content", "text/html; charset=utf-8");
     doc.head().appendChild(elem);
     System.out.println(doc.head().data());

     Element ele = doc.body();
     alterElement(ele);
     Count cTemp = modify(ele, cnt);
     cnt.setCount(cTemp.getCount());
     cnt.setPgCount(cTemp.getPgCount());
     doc.body().html(ele.html());

     res.setData(doc.html().getBytes(java.nio.charset.StandardCharsets.UTF_8));
     if (res.getMediaType() == null) {
       res.setMediaType(new MediaType("html", "html"));
     }
   }

   try (FileOutputStream out = new FileOutputStream(new File(dest))) {
     new EpubWriter().write(b, out);
   }
 }
예제 #21
0
  /**
   * Finds the currently selected week on the page, adding unknown weeks to the schedule.
   *
   * <p>Every {@code option} below the element with id {@code ctl00_body_week_drdWeeks} is looked
   * up in {@code s} by its {@code value}. For an unknown week the start date is parsed from the
   * option text (the part before {@code " -"}); its year is the first part of the schedule's
   * form text when the month is greater than 7 and the second part otherwise. The option with
   * {@code selected="selected"} is marked as loaded, updated and returned.
   *
   * @param doc the parsed page
   * @param s the schedule the weeks belong to
   * @return the selected week, or {@code null} when options exist but none is selected
   * @throws ParseException if the page has no week options or a start date cannot be parsed
   */
  public static Week getSelectedWeek(Document doc, Schedule s) throws ParseException {
    boolean optionSeen = false;
    Week selected = null;

    SimpleDateFormat startFormat = new SimpleDateFormat("yyyy dd.MM", Locale.ENGLISH);
    startFormat.setTimeZone(TimeZone.getTimeZone("Europe/Moscow"));

    for (Element selector : doc.getElementsByAttributeValue("id", "ctl00_body_week_drdWeeks")) {
      for (Element option : selector.getAllElements()) {
        if (!option.tagName().equals("option")) {
          continue;
        }
        optionSeen = true;

        String label = option.text();
        String formId = option.attr("value");
        Week week = s.getWeek(formId);

        if (week == null) {
          week = new Week();

          // The label starts with the week's first day as "dd.MM", followed by " -".
          String begin = label.substring(0, label.indexOf("-") - 1);
          String month = begin.substring(begin.indexOf(".") + 1);

          // The schedule text has the form "<first year> - <second year>".
          boolean inFirstYear = Integer.parseInt(month) > 7;
          String formText = s.getFormText();
          int dash = formText.indexOf("-");
          String year = inFirstYear ? formText.substring(0, dash - 1) : formText.substring(dash + 2);

          week.setStart(startFormat.parse(year + " " + begin));
          week.setFormText(label);
          week.setFormId(formId);

          s.addWeek(week);
        }

        if (option.hasAttr("selected") && option.attr("selected").equals("selected")) {
          selected = week;
          long u = week.setLoaded().update();
          if (BuildConfig.DEBUG) {
            Log.d("GshisHTMLParser", TS.get() + " Week.update() = " + u);
          }
        }
      }
    }

    if (!optionSeen) {
      throw new ParseException("Weeks not found", 0);
    }
    return selected;
  }
예제 #22
0
  /**
   * Sends every non-empty line of {@code texts[start..end)} to the wikify service and writes the
   * detected topics ("title weight", one per line) to one UTF-8 file per text.
   *
   * <p>Any failure is reported on {@code System.out} together with the index of the text being
   * processed and ends the run.
   */
  @Override
  public void run() {
    final String option = "&sourceMode=AUTO&repeatMode=FIRST_IN_REGION&minProbability=0";
    int i = 0;
    try {
      for (i = start; i < end; i++) {
        File result = new File("file//boc//wikified(NYT)//" + (i + 1) + ".txt");

        // split() also handles a text without any newline (one passage); previously that case
        // left the passage array null and aborted the whole run.
        String[] passage = texts[i].split("\n");

        // try-with-resources closes the file even when a request fails half-way.
        try (OutputStream out = new FileOutputStream(result, false);
            BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(out, "utf-8"))) {
          for (int j = 0; j < passage.length; j++) {
            if (passage[j].equals("")) {
              continue;
            }

            // Encode explicitly as UTF-8 instead of the deprecated platform-default variant.
            org.jsoup.nodes.Document doc =
                Jsoup.connect(
                        "http://wikipedia-miner.cms.waikato.ac.nz/services/wikify?source="
                            + URLEncoder.encode(passage[j], "UTF-8").replaceAll("\\+", "%20")
                            + option)
                    .timeout(500000)
                    .get();
            Elements links = doc.getElementsByTag("detectedtopic");
            for (Element e : links) {
              String title = e.attr("title");
              String weight = e.attr("weight");
              System.out.println(title + " " + weight);
              bw.write(title + " " + weight);
              bw.newLine();
            }
            System.out.println(j);
          }
        }
      }
    } catch (Exception e) {
      System.out.println("Error:" + i + " " + e.toString() + "\n" + e.fillInStackTrace());
    }
  }
 /**
  * Builds a simplified copy of an element tree.
  *
  * <p>{@code img}, {@code a} and {@code div} become {@code x} elements and {@code em} becomes
  * {@code b}; every other tag is kept. Images are replaced by their {@code src}; links are
  * replaced by {@code @name} (class {@code user}), a {@code VIDEO: ... THUMBNAIL: ...} text
  * (class starting with {@code postimg video}) or their {@code href} (other classes), except for
  * classes starting with {@code postimg} or equal to {@code post}, whose children are kept.
  * Elements without replacement text get their children copied: child elements recursively,
  * other nodes as text nodes of their string form.
  *
  * @param el the element to clean up
  * @return a new element; {@code el} itself is not modified
  */
 private Element cleanupElement(Element el) {
   String nodeName = el.nodeName();
   Tag replacementTag = null;
   String replacementText = null;

   if (nodeName.equals("img")) {
     replacementTag = Tag.valueOf("x");
     replacementText = el.attr("src");
   } else if (nodeName.equals("em")) {
     replacementTag = Tag.valueOf("b");
   } else if (nodeName.equals("a")) {
     replacementTag = Tag.valueOf("x");
     String cssClass = el.attr("class");
     if (cssClass.equals("user")) {
       replacementText = "@" + el.text().trim();
     } else if (cssClass.startsWith("postimg video")) {
       replacementText =
           "VIDEO: " + el.attr("href") + " THUMBNAIL: " + el.select("img").attr("src");
     } else if (!cssClass.startsWith("postimg") && !cssClass.equals("post")) {
       replacementText = el.attr("href");
     }
     // "postimg..." and "post" links keep their children instead of getting a text.
   } else if (nodeName.equals("div")) {
     replacementTag = Tag.valueOf("x");
   }

   Element cleaned = new Element(replacementTag != null ? replacementTag : el.tag(), "");

   if (replacementText != null) {
     cleaned.appendChild(new TextNode(replacementText, ""));
     return cleaned;
   }

   for (Node child : el.childNodes()) {
     if (child instanceof Element) {
       cleaned.appendChild(cleanupElement((Element) child));
     } else {
       cleaned.appendChild(new TextNode(child.toString(), ""));
     }
   }
   return cleaned;
 }
예제 #24
0
  /**
   * Makes lazily loaded images display directly by copying each image's {@code original}
   * attribute into {@code src}.
   *
   * <p>Images without an {@code original} attribute keep their current {@code src}.
   *
   * @param pcont the HTML content, may be {@code null}
   * @return the rewritten document HTML, or an empty string when {@code pcont} is {@code null}
   */
  public String reviseImgForHexun(String pcont) {
    if (pcont == null) {
      return "";
    }

    Document doc = Jsoup.parse(pcont);
    for (Element img : doc.select("img")) {
      String source = img.attr("original");
      // Skip images that carry no "original" URL; overwriting would blank a valid src.
      if (!source.isEmpty()) {
        img.attr("src", source);
      }
    }
    return doc.html();
  }
예제 #25
0
  /**
   * Makes lazily loaded images display directly by copying each image's non-empty
   * {@code data-src} attribute into {@code src}.
   *
   * @param pcont the HTML content, may be {@code null}
   * @return the rewritten document HTML, or an empty string when {@code pcont} is {@code null}
   */
  public String reviseImgForSelf(String pcont) {
    if (pcont == null) {
      return "";
    }

    Document parsed = Jsoup.parse(pcont);
    for (Element image : parsed.select("img")) {
      String lazySource = image.attr("data-src");
      if (lazySource.equals("")) {
        continue;
      }
      image.attr("src", lazySource);
    }
    return parsed.html();
  }
예제 #26
0
  /**
   * Finds the poster image of the current title.
   *
   * @return the {@code src} of the first {@code img} whose {@code alt} text contains the title,
   *     the year and the word {@code "Poster"}, or {@code null} when there is none
   */
  private String scrapeImg() {
    for (Element candidate : doc.getElementsByTag("img")) {
      String altText = candidate.attr("alt");
      boolean isPoster =
          altText.contains(title) && altText.contains(year) && altText.contains("Poster");
      if (isPoster) {
        return candidate.attr("src");
      }
    }
    return null;
  }
예제 #27
0
  /**
   * Extracts the external resources referenced by an HTML page.
   *
   * <p>Two sources are combined: the hosts of the {@code src}/{@code href} targets of
   * {@code iframe}, {@code link}, {@code img} and {@code script} elements, and URLs found in the
   * code of inline scripts. Scripts that cannot be parsed are skipped.
   *
   * @param sourceUrl the URL of the page, used to expand internal links
   * @param html the page markup
   * @return the set of resources found
   */
  public Iterable<Resource> extractResources(String sourceUrl, String html) {
    Set<Resource> found = Sets.newHashSet();
    String internalPrefix = URLHandler.createPrefixForInternalLinks(sourceUrl);

    Document doc = Jsoup.parse(html);
    Elements scripts = doc.select("script");

    List<Element> referencing = new ArrayList<Element>();
    referencing.addAll(doc.select("iframe[src]"));
    referencing.addAll(doc.select("link[href]"));
    referencing.addAll(doc.select("img[src]"));
    referencing.addAll(scripts);

    for (Element element : referencing) {
      // Prefer src; fall back to href when src does not look like a URI.
      String uri = element.attr("src").trim();
      if (!uri.contains(".")) {
        uri = element.attr("href").trim();
      }
      if (!uri.contains(".")) {
        continue;
      }

      uri = URLHandler.expandIfInternalLink(internalPrefix, uri);
      try {
        uri = URLHandler.extractHost(uri);
        found.add(new Resource(uri, type(element.tag().toString())));
      } catch (MalformedURLException e) {
        if (LOG.isWarnEnabled()) {
          LOG.warn("Malformed URL: \"" + uri + "\"");
        }
      }
    }

    // Scan the code of inline scripts for URL candidates.
    List<String> urlCandidates = new ArrayList<String>();
    for (Element script : scripts) {
      try {
        String code = script.data();
        if (code.length() > 1) {
          ParserRunner.ParseResult parsed = javascriptParser.parse(code);
          findUrlCandidates(parsed.ast, urlCandidates);
        }
      } catch (Exception ignored) {
        // Best effort: a script that fails to parse contributes no candidates.
      }
    }

    found.addAll(resourcesFromCandidates(findUrlsInCode(urlCandidates)));
    return found;
  }
예제 #28
0
 /**
  * Resolves the gallery an image page belongs to.
  *
  * @param url the URL of the image page
  * @return the URL of the first linked {@code gallery.php} page on {@code imagearn.com}
  * @throws IOException if the page cannot be loaded or links to no gallery
  */
 private URL getGalleryFromImage(URL url) throws IOException {
   Document page = Http.url(url).get();
   for (Element anchor : page.select("a[href~=^gallery\\.php.*$]")) {
     logger.info("LINK: " + anchor.toString());
     if (!anchor.hasAttr("href") || !anchor.attr("href").contains("gallery.php")) {
       continue;
     }
     URL gallery = new URL("http://imagearn.com/" + anchor.attr("href"));
     logger.info("[!] Found gallery from given link: " + gallery);
     return gallery;
   }
   throw new IOException("Failed to find gallery at URL " + url);
 }
예제 #29
0
  // Reads bytes first into a buffer, then decodes with the appropriate charset. Done this way to
  // support switching the charset midstream when a meta http-equiv tag defines the charset.
  // todo - this is getting gnarly. needs a rewrite.
  //
  // Charset resolution order, as implemented below:
  //   1. whatever detectCharsetFromBom returns for the data and the given charsetName,
  //   2. otherwise, if that is null: decode with defaultCharset, parse, and look for a charset
  //      in <meta http-equiv=content-type>, <meta charset> or the XML declaration; if a
  //      different valid charset is found the bytes are decoded and parsed again,
  //   3. otherwise the (non-empty) charsetName is used as is.
  // Returns the parsed document. When the document is parsed in the final step, its output
  // charset is set to charsetName.
  static Document parseByteData(
      ByteBuffer byteData, String charsetName, String baseUri, Parser parser) {
    String docData;
    Document doc = null;

    // look for BOM - overrides any other header or input
    charsetName = detectCharsetFromBom(byteData, charsetName);

    if (charsetName == null) { // determine from meta. safe first parse as UTF-8
      // look for <meta http-equiv="Content-Type" content="text/html;charset=gb2312"> or HTML5 <meta
      // charset="gb2312">
      docData = Charset.forName(defaultCharset).decode(byteData).toString();
      doc = parser.parseInput(docData, baseUri);
      Element meta = doc.select("meta[http-equiv=content-type], meta[charset]").first();
      String foundCharset = null; // if not found, will keep utf-8 as best attempt
      if (meta != null) {
        // the http-equiv form takes precedence; the charset attribute is only a fallback
        if (meta.hasAttr("http-equiv")) {
          foundCharset = getCharsetFromContentType(meta.attr("content"));
        }
        if (foundCharset == null && meta.hasAttr("charset")) {
          foundCharset = meta.attr("charset");
        }
      }
      // look for <?xml encoding='ISO-8859-1'?>
      if (foundCharset == null
          && doc.childNodeSize() > 0
          && doc.childNode(0) instanceof XmlDeclaration) {
        XmlDeclaration prolog = (XmlDeclaration) doc.childNode(0);
        if (prolog.name().equals("xml")) {
          foundCharset = prolog.attr("encoding");
        }
      }
      foundCharset = validateCharset(foundCharset);

      if (foundCharset != null && !foundCharset.equals(defaultCharset)) { // need to re-decode
        foundCharset = foundCharset.trim().replaceAll("[\"']", "");
        charsetName = foundCharset;
        // the first decode consumed the buffer, so rewind before decoding again
        byteData.rewind();
        docData = Charset.forName(foundCharset).decode(byteData).toString();
        // discard the first parse so the document is rebuilt from the re-decoded text below
        doc = null;
      }
    } else { // specified by content type header (or by user on file load)
      Validate.notEmpty(
          charsetName,
          "Must set charset arg to character set of file to parse. Set to null to attempt to detect from HTML");
      docData = Charset.forName(charsetName).decode(byteData).toString();
    }
    // doc is still set only when the first parse with defaultCharset could be kept
    if (doc == null) {
      doc = parser.parseInput(docData, baseUri);
      doc.outputSettings().charset(charsetName);
    }
    return doc;
  }
예제 #30
0
  /**
   * Extracts the three-level category tree from the page and adds every category to
   * {@code page}.
   *
   * <p>Each {@code div.txt-list-category-v2} yields a top-level category (name from its
   * {@code h3}, code from its {@code id}). Inside it, an anchor without {@code href} starts a
   * second-level category, and every anchor with {@code href} becomes a third-level category
   * below the most recent second-level one. Anchors with empty text, and heading anchors whose
   * text starts with a non-breaking space (char 160), are skipped.
   *
   * @param page carries the parsed document and receives the extracted categories
   * @throws RuntimeException if a third-level link appears before any second-level heading
   */
  @Override
  public void process(ResultItems page) {
    Document doc = (Document) page.getResource();

    for (Element block : doc.select("div.txt-list-category-v2")) {
      CategoryEntity ancestor =
          new CategoryEntity()
              .setName(block.select("h3").text())
              .setSite(SiteName.Taobao)
              .setCode(block.attr("id"));
      getLogger().trace(ancestor);
      page.addItem(ancestor);

      CategoryEntity parent = null;
      for (Element anchor : block.select("a")) {
        if (anchor.attr("href").isEmpty()) {
          // Heading anchor: opens a new second-level category.
          String headingName = anchor.text().trim();
          if (headingName.isEmpty() || headingName.charAt(0) == 160) {
            continue;
          }
          parent =
              new CategoryEntity()
                  .setName(headingName)
                  .setSite(SiteName.Taobao)
                  .setParent(ancestor);
          getLogger().trace(parent);
          page.addItem(parent);
          continue;
        }

        // Link anchor: a third-level category below the current heading.
        String url = anchor.absUrl("href");
        try {
          url = java.net.URLDecoder.decode(url, "utf-8");
        } catch (UnsupportedEncodingException e) {
          throw new RuntimeException(url, e);
        }
        String linkName = anchor.text().trim();
        if (linkName.isEmpty()) {
          continue;
        }
        CategoryEntity grand =
            new CategoryEntity()
                .setName(linkName)
                .setUrl(url)
                .setSite(SiteName.Taobao)
                .setParent(parent);
        if (parent == null) {
          throw new RuntimeException("no parent of " + grand);
        }
        getLogger().trace(grand);
        page.addItem(grand);
      }
    }
  }