/** This function is called when a page is fetched and ready to be processed by your program. */
@Override
public void visit(Page page) {
    // Only ad detail pages carry the fields we index.
    if (page.getWebURL().toString().contains("/ads/")
            || page.getWebURL().toString().contains("/view/")) {
        if (page.getParseData() instanceof HtmlParseData) {
            HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
            Document doc = Jsoup.parse(htmlParseData.getHtml());

            String url = page.getWebURL().getURL();
            String title = doc.title();
            String date = doc.getElementsByClass("nameDiv").get(1).text();
            String body = doc.getElementsByClass("addsTexts").get(0).text();

            System.out.println(date);
            System.out.println(body);

            Indexer.add(url, title, body, date);
        }
    }
}
/** This function is called when a page is fetched and ready to be processed by your program. */
@Override
public void visit(Page page) {
    if (page.getParseData() instanceof HtmlParseData) {
        ResultSet res = null;
        try {
            HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
            String html = htmlParseData.getHtml();
            Document doc = Jsoup.parse(html);

            // check if the page must be crawled
            String url = page.getWebURL().getURL();
            if (!shouldVisit(url)) {
                return;
            }
            currentPage = getDocumentURL(url.toLowerCase());

            // check if the page was already crawled
            queryDocumentFind.setString(1, currentPage);
            res = queryDocumentFind.executeQuery();
            if (res.next()) {
                return;
            }
            logger.debug("Code: " + currentPage);

            // insert into DOCUMENT
            queryDocumentInsert.setString(1, getSHA256(currentPage));
            queryDocumentInsert.setString(2, currentPage);
            queryDocumentInsert.setString(3, getDocumentType(url).name());
            queryDocumentInsert.setString(4, getDocumentBand(url, doc));
            queryDocumentInsert.setString(5, getDocumentAlbum(url, doc));
            queryDocumentInsert.setAsciiStream(6, getDocumentContent(url, doc));
            queryDocumentInsert.setDate(7, getDocumentCreationDate(url, doc));
            queryDocumentInsert.setString(8, getDocumentCover(url, doc));
            queryDocumentInsert.setString(9, getDocumentAuthor(url, doc));
            queryDocumentInsert.setString(10, getDocumentGenre(url, doc));
            queryDocumentInsert.setInt(11, getDocumentYear(url, doc));
            queryDocumentInsert.setString(12, getDocumentLabel(url, doc));
            queryDocumentInsert.setFloat(13, getDocumentVote(url, doc));
            queryDocumentInsert.setBoolean(14, getDocumentMilestone(url, doc));
            queryDocumentInsert.executeUpdate();
            logger.info("Document " + currentPage + " added into DOCUMENT");
        } catch (Throwable t) {
            logger.error("Error parsing page " + page.getWebURL().getURL() + ": " + t.getMessage());
        } finally {
            try {
                if (res != null) {
                    res.close();
                }
            } catch (Throwable t) {
                // ignore failures while closing the ResultSet
            }
        }
    }
}
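/*
 * For context, a minimal sketch of a plausible shape for the two prepared
 * statements used above. The connection field, table, and column names are
 * assumptions inferred from the 14 bound parameters; only the parameter
 * order comes from the code itself.
 */
queryDocumentFind = connection.prepareStatement(
    "SELECT 1 FROM DOCUMENT WHERE code = ?");
queryDocumentInsert = connection.prepareStatement(
    "INSERT INTO DOCUMENT (id, code, type, band, album, content, creation_date,"
        + " cover, author, genre, year, label, vote, milestone)"
        + " VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)");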
@Override
public void visit(Page page) {
    String url = page.getWebURL().getURL();

    // We are only interested in processing images
    if (!(page.getParseData() instanceof BinaryParseData)) {
        return;
    }
    if (!imgPatterns.matcher(url).matches()) {
        return;
    }

    // Not interested in very small images (less than 10 KB)
    if (page.getContentData().length < 10 * 1024) {
        return;
    }

    // get a unique name for storing this image
    String extension = url.substring(url.lastIndexOf("."));
    String hashedName = Cryptography.MD5(url) + extension;

    // store image
    IO.writeBytesToFile(page.getContentData(), storageFolder.getAbsolutePath() + "/" + hashedName);
    System.out.println("Stored: " + url);
}
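/*
 * The imgPatterns field is not shown above; a minimal sketch of a definition
 * consistent with the URL filtering this visit() performs (the exact
 * extension list is an assumption):
 */
private static final Pattern imgPatterns =
    Pattern.compile(".*(\\.(bmp|gif|jpe?g|png|tiff?))$");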
/** This function is called when a page is fetched and ready to be processed by your program. */
@Override
public void visit(Page page) {
    String url = page.getWebURL().getURL();
    System.out.println("URL: " + url);

    if (page.getParseData() instanceof HtmlParseData) {
        i++;
        HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
        String text = htmlParseData.getText();
        String html = htmlParseData.getHtml();
        Set<WebURL> links = htmlParseData.getOutgoingUrls();

        System.out.println("Text: " + text);
        System.out.println("Text length: " + text.length());
        System.out.println("Html length: " + html.length());
        System.out.println("Number of outgoing links: " + links.size());

        // dump the extracted text and HTML of this page to a numbered file
        try (PrintWriter writer = new PrintWriter(i + ".txt", "UTF-8")) {
            writer.println("Text: " + text);
            writer.println("Html: " + html);
        } catch (FileNotFoundException | UnsupportedEncodingException e) {
            e.printStackTrace();
        }
    }
}
@Override
public void visit(Page page) {
    if (page.getWebURL().getURL().equals("http://www.eshetab.com/")) {
        return;
    } else if (filter1.matcher(page.getWebURL().getURL()).matches()) {
        return;
    }

    System.out.println(page.getWebURL().getURL());
    Document doc = Jsoup.parse(((HtmlParseData) page.getParseData()).getHtml());
    Elements elements =
        doc.select(".addsTexts , .nameDiv:nth-child(3) , .nameDiv:nth-child(2) , .DivTitle span");

    String title = elements.get(0).text();

    // the value after the colon starts with a space, hence substring(1)
    String date = elements.get(2).text().split(":")[1].substring(1);
    // reorder the "-"-separated date parts (e.g. yyyy-MM-dd to dd/MM/yyyy)
    // before converting to the English calendar
    date = date.split("-")[2] + "/" + date.split("-")[1] + "/" + date.split("-")[0];
    date = CalendarUtility.getEnglishDate(date);
    System.out.println(date);

    String city = elements.get(3).text().split(":")[1].substring(1).split("-")[0];
    if (city.contains(",")) {
        city = city.substring(1, city.length() - 1);
    } else {
        city = "نامعلوم"; // "unknown" in Persian
    }

    String body = elements.get(4).text();
    try {
        feeds.add(new Feed(
            title, body, city, URLDecoder.decode(page.getWebURL().toString(), "UTF8"), date));
    } catch (ParseException e) {
        e.printStackTrace();
    } catch (UnsupportedEncodingException e) {
        e.printStackTrace();
    }
}
@Override
public void visit(Page page) {
    int docid = page.getWebURL().getDocid();
    String url = page.getWebURL().getURL();
    System.out.println("Docid: " + docid);

    // *** Filter urls which need to be processed ************
    if (url.startsWith(Mission.crawlDomain + "topic/")
            && page.getParseData() instanceof HtmlParseData) {
        HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
        Document dom = Jsoup.parse(htmlParseData.getHtml());

        // *** Scratch elements inside a page **************
        Elements titles = dom.select("div.title").select("h2");
        /*
         * TODO bug: text will also contain title
         */
        Elements texts = dom.select("div[class=bd art-content clearfix]");

        if (titles.size() == 1 && texts.size() == 1) {
            // access the elements only after the size check, so pages that
            // lack them cannot throw an IndexOutOfBoundsException
            String title = titles.get(0).text();
            String text = texts.get(0).text();
            System.out.println("title:" + title);
            System.out.println("text:" + text);

            String crawlClass = Mission.dbNameMap.get("crawlClass");
            String[] crawlFields = Mission.dbNameMap.get("crawlFields").split(",");

            // *** Save to database *************************
            ODatabaseRecordThreadLocal.INSTANCE.set(Mission.db);
            ODocument doc = new ODocument(crawlClass);
            doc.field(crawlFields[0], url);
            doc.field(crawlFields[1], title);
            doc.field(crawlFields[2], text);
            doc.save();
        }
    }
}
@Override
public void visit(Page page) {
    String url = page.getWebURL().getURL();
    System.out.println("Visited: " + url);

    if (page.getParseData() instanceof HtmlParseData) {
        HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
        String text = htmlParseData.getText();
        String html = htmlParseData.getHtml();
        List<WebURL> links = htmlParseData.getOutgoingUrls();

        System.out.println("Text length: " + text.length());
        System.out.println("Html length: " + html.length());
        System.out.println("Number of outgoing links: " + links.size());
    }
}
public void visit(Page page) {
    String url = page.getWebURL().getURL();

    // standard out contains a single line per URL, with the URL
    // followed by all the words found on the page
    String text = page.getText().replaceAll("[^a-zA-Z]+", " ");
    System.out.println(url + "\t" + text);

    // standard err contains a line for each outgoing link from the
    // page we're crawling
    for (WebURL link : page.getURLs()) {
        System.err.println(url + "\t" + link.getURL());
    }
}
@Override
public void visit(Page page) {
    // Keep track of visited URLs
    String url = page.getWebURL().getURL();
    stats.addCrawledUrl(url);
    System.out.println("Crawled: " + url);

    // Get the page terms and store them locally
    if (page.getParseData() instanceof HtmlParseData) { // make sure document has HTML data
        HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
        String html = htmlParseData.getHtml();

        // Don't keep documents that are too big (more than 2 MB, i.e. 2,097,152 characters)
        if (html.length() <= 2097152) {
            // Store the HTML
            this.params.getDocumentStorage().storeDocument(url, html);
        }
    }
}
@Override
public void visit(Page page) {
    String url = page.getWebURL().getURL();
    urlQueue.add(new EmailCollectorUrlMessage(url));
    logger.info("Scanning URL: " + url);

    if (page.getParseData() instanceof HtmlParseData) {
        HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
        String html = htmlParseData.getHtml();
        try {
            addEmail(html);
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
    }
    logger.info("=============");
}
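/*
 * addEmail(String) is not shown above; a minimal sketch, assuming emails are
 * harvested with a simple regex and pushed onto a blocking queue. The field
 * name emailQueue and the pattern are assumptions; the blocking put() would
 * explain the InterruptedException handled in visit().
 */
private static final Pattern EMAIL_PATTERN =
    Pattern.compile("[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,}");

private void addEmail(String html) throws InterruptedException {
    Matcher matcher = EMAIL_PATTERN.matcher(html);
    while (matcher.find()) {
        emailQueue.put(matcher.group()); // hypothetical BlockingQueue<String> field
    }
}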
/** This function is called when a page is fetched and ready to be processed by your program. */
@Override
public void visit(Page page) {
    int docid = page.getWebURL().getDocid();
    String url = page.getWebURL().getURL();
    String domain = page.getWebURL().getDomain();
    String path = page.getWebURL().getPath();
    String subDomain = page.getWebURL().getSubDomain();
    String parentUrl = page.getWebURL().getParentUrl();
    String anchor = page.getWebURL().getAnchor();

    System.out.println("Docid: " + docid);
    System.out.println("URL: " + url);
    System.out.println("Domain: '" + domain + "'");
    System.out.println("Sub-domain: '" + subDomain + "'");
    System.out.println("Path: '" + path + "'");
    System.out.println("Parent page: " + parentUrl);
    System.out.println("Anchor text: " + anchor);

    if (page.getParseData() instanceof HtmlParseData) {
        HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
        String text = htmlParseData.getText();
        String html = htmlParseData.getHtml();
        List<WebURL> links = htmlParseData.getOutgoingUrls();

        System.out.println("Text length: " + text.length());
        System.out.println("Html length: " + html.length());
        System.out.println("Number of outgoing links: " + links.size());
    }

    Header[] responseHeaders = page.getFetchResponseHeaders();
    if (responseHeaders != null) {
        System.out.println("Response headers:");
        for (Header header : responseHeaders) {
            System.out.println("\t" + header.getName() + ": " + header.getValue());
        }
    }
    System.out.println("=============");
}
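/*
 * For context, a minimal sketch of how a visit() override like the one above
 * is typically wired into crawler4j. The storage folder, seed URL, crawler
 * count, and the MyCrawler class name are placeholders.
 */
public static void main(String[] args) throws Exception {
    CrawlConfig config = new CrawlConfig();
    config.setCrawlStorageFolder("/tmp/crawler4j-storage"); // placeholder path

    PageFetcher pageFetcher = new PageFetcher(config);
    RobotstxtConfig robotstxtConfig = new RobotstxtConfig();
    RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher);

    CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);
    controller.addSeed("https://www.example.com/"); // placeholder seed
    controller.start(MyCrawler.class, 7); // MyCrawler is the class defining visit()
}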
private void printLongestPage() {
    String url = longestPage.getWebURL().getURL().toLowerCase();
    System.out.println("Longest page is: " + url);
    System.out.println("Number of words in the page: " + longestPageWordCount);
}
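/*
 * The longestPage and longestPageWordCount fields are not shown above; a
 * minimal sketch of how they might be maintained inside visit(Page), counting
 * words by whitespace splitting (the update logic is an assumption):
 */
@Override
public void visit(Page page) {
    if (page.getParseData() instanceof HtmlParseData) {
        String text = ((HtmlParseData) page.getParseData()).getText();
        int wordCount = text.trim().isEmpty() ? 0 : text.trim().split("\\s+").length;
        if (wordCount > longestPageWordCount) {
            longestPageWordCount = wordCount;
            longestPage = page;
        }
    }
}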
public ExtractorContext extractContent() {
    HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
    ExtractorContext ctx = new ExtractorContext(page, htmlParseData.getHtml());
    ctx.setMetaTags(MetadataExtractor.INST.extractMetaTags(ctx.getDoc()));

    // skip pages whose declared language is not English
    String language = ctx.getMetaTags().get(MetadataExtractor.CONTENT_LANGUAGE_TAG);
    if (language != null && !ENGLISH_US.equals(language.toLowerCase())) {
        LOG.info("Skipping " + ctx.getUrl() + " not ENGLISH");
        return null;
    }

    if (ctx.getUrl().contains(YOUTUBE)) {
        String keyWords = ctx.getMetaTags().get(MetadataExtractor.KEYWORDS_TAG);
        if (keyWords == null) {
            LOG.info("Skipping " + ctx.getUrl() + " no KEYWORDS");
            return null;
        }
        keyWords = keyWords.toLowerCase();
        if (!keyWords.contains(SAP)) {
            LOG.info("Skipping " + ctx.getUrl() + " no SAP in KEYWORDS");
            return null;
        }
    }

    Document clearedHtml = cleanHtml(ctx);
    if (clearedHtml.text().length() < MIN_CONTENT_LENGTH) {
        LOG.info("Skipping " + ctx.getUrl() + " less than " + MIN_CONTENT_LENGTH + " characters");
        return null;
    }

    // skip pages whose actual content is not English
    if (!LanguageDetector.INST.isEnglish(clearedHtml.text())) {
        LOG.info("Skipping " + ctx.getUrl() + " not ENGLISH");
        return null;
    }

    NavigableSet<ExtractedImage> images = MostRelevantImageExtractor.INST.extract(ctx);
    String uniqueFileName = CryptographyUtils.sha1(clearedHtml.text());

    PageObject pageObject = new PageObject();
    pageObject.setId(uniqueFileName);
    pageObject.setTitle(ctx.getTitle());
    pageObject.setContent(clearedHtml.html());
    pageObject.setUrl(page.getWebURL().getURL());
    pageObject.setHtmlCleanerName(ctx.getCleaner().getName());

    List<TextBlock> blocks = TextBlocksExtractor.INST.extract(clearedHtml);
    if (!blocks.isEmpty()) {
        pageObject.setMainBlock(blocks.get(0));
        blocks.remove(0);
        pageObject.addBlocks(blocks);
    }

    // set title from the first text block if it is null or empty
    if ((pageObject.getTitle() == null || "".equals(pageObject.getTitle().trim()))
            && !pageObject.getBlocks().isEmpty()) {
        pageObject.setTitle(pageObject.getBlocks().get(0).getTitle());
    }

    pageObject.addAllKeywords(BaseClassifier.INST.classify(ctx));
    if (!images.isEmpty()) {
        pageObject.addImages(images);
    }
    ctx.setPageObject(pageObject);
    return ctx;
}