/** * Parse HTML file and extract the relevant content, and send it to be downloaded. */ @Override public void run() { Log.d(String.format("AnalyzerTask is running on %s", currUrl)); HashMap<String, ArrayList<String>> exts = new HashMap<>(); // Put the extensions from config.ini in a hash-map, for passing it to the HtmlParser. exts.put("imageExtensions", WebCrawler.imageExtensions); exts.put("videoExtensions", WebCrawler.videoExtensions); exts.put("documentExtensions", WebCrawler.documentExtensions); // Extracting the relevant URLs from the given HTML. HtmlParser parser = new HtmlParser(currUrl, currHtml, exts); parser.parse(); // Filling the URL lists with downloadable data ArrayList<String> imgUrls = parser.getImagesUrls(); ArrayList<String> videoUrls = parser.getVideosUrls(); ArrayList<String> docUrls = parser.getDocsUrls(); ArrayList<String> hrefUrls = parser.getHrefUrls(); Log.d("Sending images to downloads"); sendToDownload(imgUrls, DownloaderTask.RESOURCE_TYPE_IMG); Log.d("Sending videos to downloads"); sendToDownload(videoUrls, DownloaderTask.RESOURCE_TYPE_VIDEO); Log.d("Sending documents to downloads"); sendToDownload(docUrls, DownloaderTask.RESOURCE_TYPE_DOC); Log.d("Sending HREFs to downloads"); sendToDownload(hrefUrls, DownloaderTask.RESOURCE_TYPE_HREF); decreaseNumOfAnalyzersAlive(); }
/** @param text */ public static String convert(final String text) { final HtmlParser parser = new HtmlParser(); try { parser.parse(new StringReader(text)); } catch (final IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } return parser.getText(); }
/**
 * Parses the HTML contained in the given file.
 *
 * <p>The file reader is closed as soon as parsing finishes, whether it succeeded
 * or failed, so the underlying file handle is not leaked.
 *
 * @param file the file to read from
 * @throws Exception if the file cannot be opened, or if parsing or closing the
 *         reader fails
 */
public HTML(File file) throws Exception {
    // Initialize the variables
    initialize();

    // NOTE(review): FileReader(File) decodes with the default charset — confirm
    // that matches the encoding of the files passed in.
    try (FileReader fileReader = new FileReader(file)) {
        // Set the reader to use a file
        reader = fileReader;

        // Initialize the parser
        parser = new HtmlParser(uacontext, doc);

        // Use the Cobra HTML parser
        parser.parse(reader);
    }
}
/** * Parse the current web page from the WebDriver * * @param driver * @throws Exception */ public HTML(WebDriver driver) throws Exception { // Initialize the variables initialize(); // Set the reader to use a string which comes from the current web page reader = new StringReader(driver.getPageSource()); // Initialize the parser parser = new HtmlParser(uacontext, doc); // Use the Cobra HTML parser parser.parse(reader); }
/**
 * Builds the document from the given text.
 *
 * @param sText the text to parse
 * @throws Exception if initialization or parsing fails
 */
public HTML(String sText) throws Exception {
    // Set up the instance state before anything else.
    initialize();

    // Expose the text through a string reader.
    this.reader = new StringReader(sText);

    // Create the Cobra HTML parser and run it over the reader.
    this.parser = new HtmlParser(uacontext, doc);
    this.parser.parse(this.reader);
}