/** @param text */ public static String convert(final String text) { final HtmlParser parser = new HtmlParser(); try { parser.parse(new StringReader(text)); } catch (final IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } return parser.getText(); }
/** * Parse HTML file and extract the relevant content, and send it to be downloaded. */ @Override public void run() { Log.d(String.format("AnalyzerTask is running on %s", currUrl)); HashMap<String, ArrayList<String>> exts = new HashMap<>(); // Put the extensions from config.ini in a hash-map, for passing it to the HtmlParser. exts.put("imageExtensions", WebCrawler.imageExtensions); exts.put("videoExtensions", WebCrawler.videoExtensions); exts.put("documentExtensions", WebCrawler.documentExtensions); // Extracting the relevant URLs from the given HTML. HtmlParser parser = new HtmlParser(currUrl, currHtml, exts); parser.parse(); // Filling the URL lists with downloadable data ArrayList<String> imgUrls = parser.getImagesUrls(); ArrayList<String> videoUrls = parser.getVideosUrls(); ArrayList<String> docUrls = parser.getDocsUrls(); ArrayList<String> hrefUrls = parser.getHrefUrls(); Log.d("Sending images to downloads"); sendToDownload(imgUrls, DownloaderTask.RESOURCE_TYPE_IMG); Log.d("Sending videos to downloads"); sendToDownload(videoUrls, DownloaderTask.RESOURCE_TYPE_VIDEO); Log.d("Sending documents to downloads"); sendToDownload(docUrls, DownloaderTask.RESOURCE_TYPE_DOC); Log.d("Sending HREFs to downloads"); sendToDownload(hrefUrls, DownloaderTask.RESOURCE_TYPE_HREF); decreaseNumOfAnalyzersAlive(); }
/**
 * Parses the contents of the given file.
 *
 * <p>The file is opened with a {@link FileReader}, which decodes using the JVM's default
 * charset. The reader is closed once parsing finishes or fails, so the underlying file
 * handle is not leaked.
 *
 * @param file the file to read and parse
 * @throws Exception if initialization, opening the file, parsing or closing the reader fails
 */
public HTML(File file) throws Exception {
    // Initialize the variables
    initialize();

    // Set the reader to use a file
    reader = new FileReader(file);
    try {
        // Initialize the parser
        parser = new HtmlParser(uacontext, doc);

        // Use the Cobra HTML parser
        parser.parse(reader);
    } finally {
        // Release the file handle whether or not the parse succeeded.
        reader.close();
    }
}
/** * Parse the current web page from the WebDriver * * @param driver * @throws Exception */ public HTML(WebDriver driver) throws Exception { // Initialize the variables initialize(); // Set the reader to use a string which comes from the current web page reader = new StringReader(driver.getPageSource()); // Initialize the parser parser = new HtmlParser(uacontext, doc); // Use the Cobra HTML parser parser.parse(reader); }
/** * Parse using given text * * @param sText * @throws Exception */ public HTML(String sText) throws Exception { // Initialize the variables initialize(); // Set the reader to use a string reader = new StringReader(sText); // Initialize the parser parser = new HtmlParser(uacontext, doc); // Use the Cobra HTML parser parser.parse(reader); }
/** * Method main * * @param args * @throws Exception */ public static void main(String[] args) throws Exception { OpenUrl ou = new OpenUrl(); File f = new File("movie"); File a = new File("a2d.txt"); BufferedWriter wr1 = new BufferedWriter(new FileWriter(a)); System.out.println(f.list().length); for (int i = 0; i < f.list().length; i++) { File f1 = new File(f + File.separator + f.list()[i]); System.out.println(f.list()[i]); String content = ou.getContent(f1); // System.out.println(content); HtmlParser tp = new HtmlParser(); tp.doParser(content); System.out.println(tp.author); wr1.write(tp.author + ";" + f.list()[i]); wr1.newLine(); } wr1.close(); }
/**
 * Retrieves the content for one page by delegating to {@code HtmlParser.getHtmlContent}.
 *
 * @param url    the page address, passed through unchanged
 * @param encode the encoding argument, passed through unchanged
 * @return the value returned by {@code HtmlParser.getHtmlContent(url, encode)}
 */
public String GetOnePage(String url, String encode) {
    final String pageContent = HtmlParser.getHtmlContent(url, encode);
    return pageContent;
}