/**
 * Converts HTML markup to the text extracted by {@code HtmlParser}.
 *
 * <p>The markup is fed to a fresh parser through a {@link java.io.StringReader}, and the
 * parser's {@code getText()} result is returned.
 *
 * @param text the HTML markup to parse; must not be {@code null}
 * @return the value of {@code getText()} after parsing
 * @throws java.io.UncheckedIOException if the parser reports an {@link java.io.IOException};
 *     the original cause is preserved
 */
 public static String convert(final String text) {
   final HtmlParser parser = new HtmlParser();
   try {
     parser.parse(new StringReader(text));
   } catch (final IOException e) {
     // Fail loudly instead of dumping a stack trace and returning whatever the parser had
     // accumulated before the error.
     throw new java.io.UncheckedIOException("Failed to parse HTML text", e);
   }
   return parser.getText();
 }
Exemplo n.º 2
0
  /**
   * Parses the current HTML page, extracts the URLs it references, and sends them to be
   * downloaded.
   *
   * <p>The extension lists held by {@code WebCrawler} are handed to an {@code HtmlParser} together
   * with {@code currUrl} and {@code currHtml}. After parsing, the image, video, document and HREF
   * URL lists are passed, in that order, to {@code sendToDownload} with the matching
   * {@code DownloaderTask} resource type.
   *
   * <p>{@code decreaseNumOfAnalyzersAlive()} is always called when this method exits, including
   * when parsing or dispatching throws, so a failed analysis cannot leave the alive-count stuck.
   */
  @Override
  public void run() {
    try {
      Log.d(String.format("AnalyzerTask is running on %s", currUrl));

      // Put the extensions from config.ini in a hash-map, for passing it to the HtmlParser.
      HashMap<String, ArrayList<String>> exts = new HashMap<>();
      exts.put("imageExtensions", WebCrawler.imageExtensions);
      exts.put("videoExtensions", WebCrawler.videoExtensions);
      exts.put("documentExtensions", WebCrawler.documentExtensions);

      // Extracting the relevant URLs from the given HTML.
      HtmlParser parser = new HtmlParser(currUrl, currHtml, exts);
      parser.parse();

      // Filling the URL lists with downloadable data
      ArrayList<String> imgUrls = parser.getImagesUrls();
      ArrayList<String> videoUrls = parser.getVideosUrls();
      ArrayList<String> docUrls = parser.getDocsUrls();
      ArrayList<String> hrefUrls = parser.getHrefUrls();

      Log.d("Sending images to downloads");
      sendToDownload(imgUrls, DownloaderTask.RESOURCE_TYPE_IMG);
      Log.d("Sending videos to downloads");
      sendToDownload(videoUrls, DownloaderTask.RESOURCE_TYPE_VIDEO);
      Log.d("Sending documents to downloads");
      sendToDownload(docUrls, DownloaderTask.RESOURCE_TYPE_DOC);
      Log.d("Sending HREFs to downloads");
      sendToDownload(hrefUrls, DownloaderTask.RESOURCE_TYPE_HREF);
    } finally {
      // Release this analyzer's slot on every exit path; skipping it after an exception would
      // leave the counter permanently one too high.
      decreaseNumOfAnalyzersAlive();
    }
  }
Exemplo n.º 3
0
  /**
   * Parse using given file
   *
   * @param file
   * @throws Exception
   */
  public HTML(File file) throws Exception {
    // Initialize the variables
    initialize();

    // Set the reader to use a file
    reader = new FileReader(file);

    // Initialize the parser
    parser = new HtmlParser(uacontext, doc);

    // Use the Cobra HTML parser
    parser.parse(reader);
  }
Exemplo n.º 4
0
  /**
   * Parse the current web page from the WebDriver
   *
   * @param driver
   * @throws Exception
   */
  public HTML(WebDriver driver) throws Exception {
    // Initialize the variables
    initialize();

    // Set the reader to use a string which comes from the current web page
    reader = new StringReader(driver.getPageSource());

    // Initialize the parser
    parser = new HtmlParser(uacontext, doc);

    // Use the Cobra HTML parser
    parser.parse(reader);
  }
Exemplo n.º 5
0
  /**
   * Parse using given text
   *
   * @param sText
   * @throws Exception
   */
  public HTML(String sText) throws Exception {
    // Initialize the variables
    initialize();

    // Set the reader to use a string
    reader = new StringReader(sText);

    // Initialize the parser
    parser = new HtmlParser(uacontext, doc);

    // Use the Cobra HTML parser
    parser.parse(reader);
  }
Exemplo n.º 6
0
  /**
   * Writes one {@code author;fileName} line to {@code a2d.txt} for every entry of the
   * {@code movie} directory.
   *
   * <p>For each entry, the content returned by {@code OpenUrl.getContent} is passed to
   * {@code HtmlParser.doParser}, and the parser's {@code author} field is printed and written
   * out. The number of entries and each entry name are also printed to standard output.
   *
   * @param args unused
   * @throws IllegalStateException if {@code movie} cannot be listed as a directory
   * @throws Exception if reading an entry, parsing it, or writing the output file fails
   */
  public static void main(String[] args) throws Exception {

    OpenUrl ou = new OpenUrl();
    File movieDir = new File("movie");

    // List the directory exactly once: File.list() rescans on every call and returns null when
    // the path is not a readable directory. Checking before opening the writer also avoids
    // truncating a2d.txt when there is nothing to process.
    String[] names = movieDir.list();
    if (names == null) {
      throw new IllegalStateException(
          "Not a readable directory: " + movieDir.getAbsolutePath());
    }

    System.out.println(names.length);

    // try-with-resources flushes and closes the output file even if an entry fails.
    try (BufferedWriter out = new BufferedWriter(new FileWriter(new File("a2d.txt")))) {
      for (String name : names) {
        File entry = new File(movieDir, name);
        System.out.println(name);
        String content = ou.getContent(entry);
        HtmlParser tp = new HtmlParser();
        tp.doParser(content);
        System.out.println(tp.author);
        out.write(tp.author + ";" + name);
        out.newLine();
      }
    }
  }
Exemplo n.º 7
0
 /**
  * Delegates to {@code HtmlParser.getHtmlContent(url, encode)} and returns its result unchanged.
  *
  * @param url forwarded as the first argument; presumably the page address (confirm against
  *     {@code HtmlParser})
  * @param encode forwarded as the second argument; presumably a charset name (confirm against
  *     {@code HtmlParser})
  * @return whatever {@code HtmlParser.getHtmlContent} returns for these arguments
  */
 public String GetOnePage(String url, String encode) {
   final String pageContent = HtmlParser.getHtmlContent(url, encode);
   return pageContent;
 }