/** Adds the given URL to the given sitemap while showing the relevant logs */ private void addUrlIntoSitemap( String urlStr, SiteMap siteMap, String lastMod, String changeFreq, String priority, int urlIndex) { try { URL url = new URL(urlStr); // Checking the URL boolean valid = urlIsValid(siteMap.getBaseUrl(), url.toString()); if (valid || !strict) { SiteMapURL sUrl = new SiteMapURL(url.toString(), lastMod, changeFreq, priority, valid); siteMap.addSiteMapUrl(sUrl); LOG.debug(" {}. {}", (urlIndex + 1), sUrl); } else { LOG.warn( "URL: {} is excluded from the sitemap as it is not a valid url = not under the base url: {}", url.toExternalForm(), siteMap.getBaseUrl()); } } catch (MalformedURLException e) { LOG.warn("Bad url: [{}]", urlStr); LOG.trace("Can't create a sitemap entry with a bad URL", e); } }
/** * Parse XML that contains a valid Sitemap. Example of a Sitemap: <?xml version="1.0" * encoding="UTF-8"?> <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"> <url> * <loc>http://www.example.com/</loc> <lastmod>2005-01-01</lastmod> * <changefreq>monthly</changefreq> <priority>0.8</priority> </url> <url> <loc * >http://www.example.com/catalog?item=12&desc=vacation_hawaii</loc> * <changefreq>weekly</changefreq> </url> </urlset> * * @param doc */ private SiteMap parseXmlSitemap(URL sitemapUrl, Document doc) { SiteMap sitemap = new SiteMap(sitemapUrl); sitemap.setType(SitemapType.XML); NodeList list = doc.getElementsByTagName("url"); // Loop through the <url>s for (int i = 0; i < list.getLength(); i++) { Node n = list.item(i); if (n.getNodeType() == Node.ELEMENT_NODE) { Element elem = (Element) n; String lastMod = getElementValue(elem, "lastmod"); String changeFreq = getElementValue(elem, "changefreq"); String priority = getElementValue(elem, "priority"); String loc = getElementValue(elem, "loc"); addUrlIntoSitemap(loc, sitemap, lastMod, changeFreq, priority, i); } } sitemap.setProcessed(true); return sitemap; }
/** * Parse the XML document, looking for a <b>feed</b> element to determine if it's an <b>Atom * doc</b> <b>rss</b> to determine if it's an <b>RSS doc</b>. * * @param sitemapUrl * @param doc - XML document to parse * @throws UnknownFormatException if XML does not appear to be Atom or RSS */ private SiteMap parseSyndicationFormat(URL sitemapUrl, Document doc) throws UnknownFormatException { SiteMap sitemap = new SiteMap(sitemapUrl); // See if this is an Atom feed by looking for "feed" element NodeList list = doc.getElementsByTagName("feed"); if (list.getLength() > 0) { parseAtom(sitemap, (Element) list.item(0), doc); sitemap.setProcessed(true); return sitemap; } else { // See if it is a RSS feed by looking for a "channel" element. This avoids the issue // of having the outer tag named <rdf:RDF> that was causing this code to fail. Inside of // the <rss> or <rdf> tag is a <channel> tag, so we can use that. // See https://github.com/crawler-commons/crawler-commons/issues/87 // and also RSS 1.0 specification http://web.resource.org/rss/1.0/spec list = doc.getElementsByTagName("channel"); if (list.getLength() > 0) { parseRSS(sitemap, doc); sitemap.setProcessed(true); return sitemap; } else { throw new UnknownFormatException("Unknown syndication format at " + sitemapUrl); } } }
/** * Parse XML document which is assumed to be in RSS format. RSS 2.0 example: * * <p><?xml version="1.0"?> <rss version="2.0"> <channel> <title>Lift Off News</title> * <link>http://liftoff.msfc.nasa.gov/</link> <description>Liftoff to Space * Exploration.</description> <language>en-us</language> <pubDate>Tue, 10 Jun 2003 04:00:00 * GMT</pubDate> <lastBuildDate>Tue, 10 Jun 2003 09:41:01 GMT</lastBuildDate> * <docs>http://blogs.law.harvard.edu/tech/rss</docs> <generator>Weblog Editor 2.0</generator> * <managingEditor>[email protected]</managingEditor> * <webMaster>[email protected]</webMaster> <ttl>5</ttl> * * <p><item> <title>Star City</title> * <link>http://liftoff.msfc.nasa.gov/news/2003/news-starcity.asp</link> <description>How do * Americans get ready to work with Russians aboard the International Space Station? They take a * crash course in culture, language and protocol at Russia's Star City.</description> * <pubDate>Tue, 03 Jun 2003 09:39:21 GMT</pubDate> * <guid>http://liftoff.msfc.nasa.gov/2003/06/03.html#item573</guid> </item> * * <p><item> <title>Space Exploration</title> <link>http://liftoff.msfc.nasa.gov/</link> * <description>Sky watchers in Europe, Asia, and parts of Alaska and Canada will experience a * partial eclipse of the Sun on Saturday, May 31.</description> <pubDate>Fri, 30 May 2003 * 11:06:42 GMT</pubDate> <guid>http://liftoff.msfc.nasa.gov/2003/05/30.html#item572</guid> * </item> * * <p></channel> </rss> * * @param sitemap * @param doc */ private void parseRSS(SiteMap sitemap, Document doc) { // Grab items from <item><link>URL</link></item> // and last modified date from <pubDate>DATE</pubDate> LOG.debug("Parsing RSS doc"); sitemap.setType(SitemapType.RSS); NodeList list = doc.getElementsByTagName("channel"); Element elem = (Element) list.item(0); // Treat publication date as last mod (Tue, 10 Jun 2003 04:00:00 GMT) String lastMod = getElementValue(elem, "pubDate"); LOG.debug("lastMod = ", lastMod); list = doc.getElementsByTagName("item"); 
// Loop through the <item>s for (int i = 0; i < list.getLength() && i < MAX_URLS; i++) { Node n = list.item(i); if (n.getNodeType() == Node.ELEMENT_NODE) { elem = (Element) n; String link = getElementValue(elem, "link"); addUrlIntoSitemap(link, sitemap, lastMod, null, null, i); } } }
/** * Parse the XML document which is assumed to be in Atom format. Atom 1.0 example: * * <p><?xml version="1.0" encoding="utf-8"?> <feed xmlns="http://www.w3.org/2005/Atom"> * * <p><title>Example Feed</title> <subtitle>A subtitle.</subtitle> <link * href="http://example.org/feed/" rel="self"/> <link href="http://example.org/"/> * <modified>2003-12-13T18:30:02Z</modified> <author> <name>John Doe</name> * <email>[email protected]</email> </author> * <id>urn:uuid:60a76c80-d399-11d9-b91C-0003939e0af6</id> * * <p><entry> <title>Atom-Powered Robots Run Amok</title> <link * href="http://example.org/2003/12/13/atom03"/> * <id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id> <updated>2003-12-13T18:30:02Z</updated> * <summary>Some text.</summary> </entry> * * <p></feed> * * @param elem * @param doc */ private void parseAtom(SiteMap sitemap, Element elem, Document doc) { // Grab items from <feed><entry><link href="URL" /></entry></feed> // Use lastmod date from <feed><modified>DATE</modified></feed> LOG.debug("Parsing Atom XML"); sitemap.setType(SitemapType.ATOM); String lastMod = getElementValue(elem, "modified"); LOG.debug("lastMod = {}", lastMod); NodeList list = doc.getElementsByTagName("entry"); // Loop through the <entry>s for (int i = 0; i < list.getLength() && i < MAX_URLS; i++) { Node n = list.item(i); if (n.getNodeType() == Node.ELEMENT_NODE) { elem = (Element) n; String href = getElementAttributeValue(elem, "link", "href"); addUrlIntoSitemap(href, sitemap, lastMod, null, null, i); } } }
public void open(String fileName) throws SQLException, SAXException, IOException, Exception { readAndParseFile(fileName); model.getDb().close(false); model.getDb().open(fileName); this.fileName = fileName; SiteNode newRoot = new SiteNode("Sites"); siteTree.setRoot(newRoot); // update history reference List<Integer> list = model .getDb() .getTableHistory() .getHistoryList(getSessionId(), HistoryReference.TYPE_MANUAL); HistoryReference historyRef = null; for (int i = 0; i < list.size(); i++) { int historyId = ((Integer) list.get(i)).intValue(); try { historyRef = new HistoryReference(historyId); getSiteTree().addPath(historyRef); if (i % 100 == 99) Thread.yield(); } catch (Exception e) { // ZAP: Log exceptions log.warn(e.getMessage(), e); } } // update siteTree reference list = model .getDb() .getTableHistory() .getHistoryList(getSessionId(), HistoryReference.TYPE_SPIDER); for (int i = 0; i < list.size(); i++) { int historyId = ((Integer) list.get(i)).intValue(); try { historyRef = new HistoryReference(historyId); getSiteTree().addPath(historyRef); if (i % 100 == 99) { Thread.yield(); } } catch (Exception e) { } } System.gc(); }
/**
 * Renders this entry as {@code url = "...", lastMod = ..., changeFreq = ..., priority = ...},
 * formatting the last-modified date with the sitemap's full date format (or "null" when unset).
 */
@Override
public String toString() {
    String lastModText =
            (lastModified == null) ? "null" : SiteMap.getFullDateFormat().format(lastModified);
    return new StringBuilder()
            .append("url = \"")
            .append(url)
            .append("\"")
            .append(", lastMod = ")
            .append(lastModText)
            .append(", changeFreq = ")
            .append(changeFreq)
            .append(", priority = ")
            .append(priority)
            .toString();
}
/**
 * Process a text-based Sitemap. Text sitemaps only list URLs but no priorities, last mods, etc.
 * Content is decoded as UTF-8 after stripping any byte-order mark; at most {@code MAX_URLS}
 * non-empty lines are added.
 *
 * @param sitemapUrl the URL the sitemap was fetched from
 * @param content the raw sitemap bytes
 * @return the populated sitemap, marked as processed
 * @throws IOException if reading the content fails
 */
private SiteMap processText(String sitemapUrl, byte[] content) throws IOException {
    LOG.debug("Processing textual Sitemap");

    SiteMap textSiteMap = new SiteMap(sitemapUrl);
    textSiteMap.setType(SitemapType.TEXT);

    // try-with-resources closes the reader (and the wrapped streams) on every exit path;
    // the original leaked it behind a @SuppressWarnings("resource").
    try (BufferedReader reader =
            new BufferedReader(
                    new InputStreamReader(
                            new BOMInputStream(new ByteArrayInputStream(content)), "UTF-8"))) {
        String line;
        int i = 1;
        while ((line = reader.readLine()) != null) {
            if (line.length() > 0 && i <= MAX_URLS) {
                addUrlIntoSitemap(line, textSiteMap, null, null, null, i++);
            }
        }
    }

    textSiteMap.setProcessed(true);
    return textSiteMap;
}
/**
 * Constructor for the current session. The current system time will be used as the session ID.
 *
 * @param model the model this session belongs to; also used to create the default site tree
 */
public Session(Model model) { super(ROOT); // add session variable here setSessionId(System.currentTimeMillis()); setSessionName("Untitled Session"); setSessionDesc(""); // create default object this.siteTree = SiteMap.createTree(model); this.model = model; }
/**
 * Synchronous call to save a session.
 *
 * <p>For a brand-new session the database is moved to the target file; when saving an existing
 * session under a different name, the database is copied first. The site tree is then persisted
 * under its own lock, and the session row is updated with the current name.
 *
 * @param fileName path to save the session to; becomes this session's file name on success
 * @throws Exception if the file write, database move/copy, or table update fails
 */
public void save(String fileName) throws Exception { saveFile(fileName); if (isNewState()) { model.moveSessionDb(fileName); } else { if (!this.fileName.equals(fileName)) { // copy file to new fileName model.copySessionDb(this.fileName, fileName); } } this.fileName = fileName; synchronized (siteTree) { saveSiteTree((SiteNode) siteTree.getRoot()); } model.getDb().getTableSession().update(getSessionId(), getSessionName()); }
/**
 * Generate the sitemap files.
 *
 * <p>Emits a sitemap-index document via SAX events: one &lt;sitemap&gt; entry per file produced
 * by each provider's SiteMap group. Each provider is fully consumed (and its group finished)
 * even if iteration fails, thanks to the finally block.
 *
 * <p>NOTE(review): the &lt;lastmod&gt; written for every entry is {@code new Date()} (generation
 * time), not the underlying pages' modification time — confirm this is intended.
 * NOTE(review): the inner &lt;sitemap&gt; elements are opened/closed with an empty namespace URI
 * while &lt;sitemapindex&gt; uses {@code NS} — presumably relying on the in-scope default
 * namespace; verify the serialized output is namespace-correct.
 *
 * @throws IOException if the files could not be created.
 * @throws SAXException if a xml error occurs.
 */
public void generate() throws IOException, SAXException { int totalCount = 0; AttributesImpl schemaLocation = new AttributesImpl(); transformerHandler.startDocument(); transformerHandler.startPrefixMapping("xsd", XMLConstants.W3C_XML_SCHEMA_NS_URI); transformerHandler.startPrefixMapping("xsi", XMLConstants.W3C_XML_SCHEMA_INSTANCE_NS_URI); schemaLocation.addAttribute( XMLConstants.W3C_XML_SCHEMA_NS_URI, "schemaLocation", "xsi:schemaLocation", "CDATA", "http://www.sitemaps.org/schemas/sitemap/0.9/siteindex.xsd"); transformerHandler.startElement(NS, "", "sitemapindex", schemaLocation); for (final PageProvider provider : providers) { LOG.info("Processing " + provider.getName()); final SiteMap group = new SiteMap(provider.getName()); try { for (final Page page : provider) { if (null != page) { group.addPage(page); } } } finally { group.finish(); LOG.info(group.getCount() + " entries processed for " + provider.getName()); totalCount += group.getCount(); } for (final SiteMap.SiteMapFile map : group.getSiteMaps()) { transformerHandler.startElement("", "", "sitemap", new AttributesImpl()); addElement("loc", uri.resolve(map.getFileName()).toString()); addElement("lastmod", formatDateW3c((new Date()))); transformerHandler.endElement("", "", "sitemap"); } } transformerHandler.endElement(NS, "", "sitemapindex"); transformerHandler.endDocument(); writer.close(); LOG.info("All done (" + totalCount + " entries)"); }
/** * Parse XML that contains a Sitemap Index. Example Sitemap Index: * * <p><?xml version="1.0" encoding="UTF-8"?> <sitemapindex * xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"> <sitemap> * <loc>http://www.example.com/sitemap1.xml.gz</loc> <lastmod>2004-10-01T18:23:17+00:00</lastmod> * </sitemap> <sitemap> <loc>http://www.example.com/sitemap2.xml.gz</loc> * <lastmod>2005-01-01</lastmod> </sitemap> </sitemapindex> * * @param url - URL of Sitemap Index * @param nodeList */ private SiteMapIndex parseSitemapIndex(URL url, NodeList nodeList) { LOG.debug("Parsing Sitemap Index"); SiteMapIndex sitemapIndex = new SiteMapIndex(url); sitemapIndex.setType(SitemapType.INDEX); // Loop through the <sitemap>s for (int i = 0; i < nodeList.getLength() && i < MAX_URLS; i++) { Node firstNode = nodeList.item(i); if (firstNode.getNodeType() == Node.ELEMENT_NODE) { Element elem = (Element) firstNode; String loc = getElementValue(elem, "loc"); // try the text content when no loc element // has been specified if (loc == null) { loc = elem.getTextContent().trim(); } try { URL sitemapUrl = new URL(loc); String lastmod = getElementValue(elem, "lastmod"); Date lastModified = SiteMap.convertToDate(lastmod); // Right now we are not worried about sitemapUrls that point // to different websites. SiteMap s = new SiteMap(sitemapUrl, lastModified); sitemapIndex.addSitemap(s); LOG.debug(" {}. {}", (i + 1), s); } catch (MalformedURLException e) { LOG.trace("Don't create an entry with a bad URL", e); LOG.debug("Bad url: [{}]", loc); } } } sitemapIndex.setProcessed(true); return sitemapIndex; }
/**
 * Set when this URL was last modified.
 *
 * @param lastModified the last-modification timestamp as a string, converted to a Date via
 *     {@link SiteMap#convertToDate}
 */
public void setLastModified(String lastModified) { this.lastModified = SiteMap.convertToDate(lastModified); }