/**
   * Adds the given URL to the given sitemap, logging the outcome.
   *
   * <p>The text is first parsed as a {@link URL}; text that cannot be parsed is logged and
   * skipped. A parsed URL is then checked with {@code urlIsValid} against the sitemap's base URL.
   * The entry is added when that check passes or when this parser is not {@code strict};
   * otherwise a warning is logged and nothing is added.
   *
   * @param urlStr text of the URL to add
   * @param siteMap sitemap that receives the new entry
   * @param lastMod last-modification value handed to the new entry, may be {@code null}
   * @param changeFreq change-frequency value handed to the new entry, may be {@code null}
   * @param priority priority value handed to the new entry, may be {@code null}
   * @param urlIndex zero-based position of the URL; only used for the debug log, where it is
   *     printed as {@code urlIndex + 1}
   */
  private void addUrlIntoSitemap(
      String urlStr,
      SiteMap siteMap,
      String lastMod,
      String changeFreq,
      String priority,
      int urlIndex) {
    try {
      // Parsing doubles as the well-formedness check
      URL parsedUrl = new URL(urlStr);
      String urlText = parsedUrl.toString();
      boolean underBase = urlIsValid(siteMap.getBaseUrl(), urlText);

      // In strict mode an URL that fails the base-url check is rejected outright
      if (strict && !underBase) {
        LOG.warn(
            "URL: {} is excluded from the sitemap as it is not a valid url = not under the base url: {}",
            parsedUrl.toExternalForm(),
            siteMap.getBaseUrl());
        return;
      }

      SiteMapURL entry = new SiteMapURL(urlText, lastMod, changeFreq, priority, underBase);
      siteMap.addSiteMapUrl(entry);
      LOG.debug("  {}. {}", (urlIndex + 1), entry);
    } catch (MalformedURLException e) {
      LOG.warn("Bad url: [{}]", urlStr);
      LOG.trace("Can't create a sitemap entry with a bad URL", e);
    }
  }
  /**
   * Builds a {@link SiteMap} of type {@code SitemapType.XML} from an XML sitemap document.
   *
   * <p>Every {@code <url>} element of the document contributes one entry, read from its {@code
   * <loc>}, {@code <lastmod>}, {@code <changefreq>} and {@code <priority>} children. Example:
   *
   * <pre>{@code
   * <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
   *   <url>
   *     <loc>http://www.example.com/</loc>
   *     <lastmod>2005-01-01</lastmod>
   *     <changefreq>monthly</changefreq>
   *     <priority>0.8</priority>
   *   </url>
   * </urlset>
   * }</pre>
   *
   * @param sitemapUrl URL the sitemap was obtained from
   * @param doc parsed XML document of the sitemap
   * @return the sitemap, marked as processed
   */
  private SiteMap parseXmlSitemap(URL sitemapUrl, Document doc) {
    SiteMap result = new SiteMap(sitemapUrl);
    result.setType(SitemapType.XML);

    NodeList urlNodes = doc.getElementsByTagName("url");
    for (int idx = 0; idx < urlNodes.getLength(); idx++) {
      Node candidate = urlNodes.item(idx);
      if (candidate.getNodeType() != Node.ELEMENT_NODE) {
        continue;
      }

      Element urlElem = (Element) candidate;
      String lastMod = getElementValue(urlElem, "lastmod");
      String changeFreq = getElementValue(urlElem, "changefreq");
      String priority = getElementValue(urlElem, "priority");
      String loc = getElementValue(urlElem, "loc");

      addUrlIntoSitemap(loc, result, lastMod, changeFreq, priority, idx);
    }

    result.setProcessed(true);
    return result;
  }
  /**
   * Parses a syndication feed, deciding between Atom and RSS from the elements it contains.
   *
   * <p>A document with a {@code feed} element is handled as Atom. Otherwise a document with a
   * {@code channel} element is handled as RSS.
   *
   * @param sitemapUrl URL the document was obtained from
   * @param doc XML document to parse
   * @return the sitemap built from the feed, marked as processed
   * @throws UnknownFormatException if the document has neither a {@code feed} nor a {@code
   *     channel} element
   */
  private SiteMap parseSyndicationFormat(URL sitemapUrl, Document doc)
      throws UnknownFormatException {

    SiteMap feedSitemap = new SiteMap(sitemapUrl);

    // Atom: identified by a "feed" element
    NodeList feeds = doc.getElementsByTagName("feed");
    if (feeds.getLength() > 0) {
      parseAtom(feedSitemap, (Element) feeds.item(0), doc);
      feedSitemap.setProcessed(true);
      return feedSitemap;
    }

    // RSS: identified by a "channel" element rather than by the outer tag, because the outer tag
    // may be <rss> or <rdf:RDF> while both contain a <channel>.
    // See https://github.com/crawler-commons/crawler-commons/issues/87
    // and also RSS 1.0 specification http://web.resource.org/rss/1.0/spec
    NodeList channels = doc.getElementsByTagName("channel");
    if (channels.getLength() < 1) {
      throw new UnknownFormatException("Unknown syndication format at " + sitemapUrl);
    }

    parseRSS(feedSitemap, doc);
    feedSitemap.setProcessed(true);
    return feedSitemap;
  }
  /**
   * Fills {@code sitemap} from an XML document that is assumed to be in RSS format.
   *
   * <p>The sitemap type is set to {@code SitemapType.RSS}. Each {@code <item>} contributes the
   * URL found in its {@code <link>} child; at most {@code MAX_URLS} items are read. The {@code
   * <pubDate>} looked up on the first {@code <channel>} element is used as the last-modified
   * value of every entry. RSS 2.0 example:
   *
   * <pre>{@code
   * <rss version="2.0">
   *   <channel>
   *     <title>Lift Off News</title>
   *     <link>http://liftoff.msfc.nasa.gov/</link>
   *     <pubDate>Tue, 10 Jun 2003 04:00:00 GMT</pubDate>
   *     <item>
   *       <title>Star City</title>
   *       <link>http://liftoff.msfc.nasa.gov/news/2003/news-starcity.asp</link>
   *       <pubDate>Tue, 03 Jun 2003 09:39:21 GMT</pubDate>
   *     </item>
   *   </channel>
   * </rss>
   * }</pre>
   *
   * @param sitemap sitemap that receives the entries
   * @param doc XML document to read; the caller is expected to have checked that it contains a
   *     {@code <channel>} element
   */
  private void parseRSS(SiteMap sitemap, Document doc) {

    // Grab items from <item><link>URL</link></item>
    // and last modified date from <pubDate>DATE</pubDate>

    LOG.debug("Parsing RSS doc");
    sitemap.setType(SitemapType.RSS);
    NodeList list = doc.getElementsByTagName("channel");
    Element elem = (Element) list.item(0);

    // Treat publication date as last mod (Tue, 10 Jun 2003 04:00:00 GMT)
    String lastMod = getElementValue(elem, "pubDate");
    // The "{}" placeholder is required, otherwise the logger drops the argument
    LOG.debug("lastMod = {}", lastMod);

    list = doc.getElementsByTagName("item");
    // Loop through the <item>s
    for (int i = 0; i < list.getLength() && i < MAX_URLS; i++) {

      Node n = list.item(i);
      if (n.getNodeType() == Node.ELEMENT_NODE) {
        elem = (Element) n;
        String link = getElementValue(elem, "link");

        addUrlIntoSitemap(link, sitemap, lastMod, null, null, i);
      }
    }
  }
  /**
   * Fills {@code sitemap} from an XML document that is assumed to be in Atom format.
   *
   * <p>The sitemap type is set to {@code SitemapType.ATOM}. Each {@code <entry>} contributes the
   * {@code href} attribute of its {@code <link>} child; at most {@code MAX_URLS} entries are
   * read. The {@code <modified>} value looked up on {@code elem} is used as the last-modified
   * value of every entry. Atom 1.0 example:
   *
   * <pre>{@code
   * <feed xmlns="http://www.w3.org/2005/Atom">
   *   <title>Example Feed</title>
   *   <link href="http://example.org/"/>
   *   <modified>2003-12-13T18:30:02Z</modified>
   *   <entry>
   *     <title>Atom-Powered Robots Run Amok</title>
   *     <link href="http://example.org/2003/12/13/atom03"/>
   *     <updated>2003-12-13T18:30:02Z</updated>
   *   </entry>
   * </feed>
   * }</pre>
   *
   * @param sitemap sitemap that receives the entries
   * @param elem element on which the {@code <modified>} value is looked up (the caller passes the
   *     {@code <feed>} element)
   * @param doc XML document whose {@code <entry>} elements are read
   */
  private void parseAtom(SiteMap sitemap, Element elem, Document doc) {

    // Entries come from <feed><entry><link href="URL" /></entry></feed>,
    // the last-modified date from <feed><modified>DATE</modified></feed>

    LOG.debug("Parsing Atom XML");

    sitemap.setType(SitemapType.ATOM);

    String feedLastMod = getElementValue(elem, "modified");
    LOG.debug("lastMod = {}", feedLastMod);

    NodeList entries = doc.getElementsByTagName("entry");

    for (int idx = 0; idx < entries.getLength() && idx < MAX_URLS; idx++) {
      Node candidate = entries.item(idx);
      if (candidate.getNodeType() != Node.ELEMENT_NODE) {
        continue;
      }

      Element entry = (Element) candidate;
      String href = getElementAttributeValue(entry, "link", "href");
      addUrlIntoSitemap(href, sitemap, feedLastMod, null, null, idx);
    }
  }
示例#6
0
  /**
   * Opens the session stored under the given file name and rebuilds the site tree from it.
   *
   * <p>The file is read and parsed, the model's database is closed and reopened on {@code
   * fileName}, the site tree gets a fresh "Sites" root, and every history entry of type {@code
   * TYPE_MANUAL} and then {@code TYPE_SPIDER} for this session is added to the tree. A history
   * entry that cannot be loaded or added is logged and skipped, so one bad record does not abort
   * the whole load.
   *
   * @param fileName name of the session file to open
   * @throws SQLException declared for the database calls
   * @throws SAXException declared for parsing the session file
   * @throws IOException declared for reading the session file
   * @throws Exception on any other failure
   */
  public void open(String fileName) throws SQLException, SAXException, IOException, Exception {

    readAndParseFile(fileName);
    model.getDb().close(false);
    model.getDb().open(fileName);
    this.fileName = fileName;

    SiteNode newRoot = new SiteNode("Sites");
    siteTree.setRoot(newRoot);

    // update history reference
    List<Integer> list =
        model
            .getDb()
            .getTableHistory()
            .getHistoryList(getSessionId(), HistoryReference.TYPE_MANUAL);

    for (int i = 0; i < list.size(); i++) {
      int historyId = list.get(i);

      try {
        HistoryReference historyRef = new HistoryReference(historyId);
        getSiteTree().addPath(historyRef);

        // Give other threads a chance to run after every 100 entries
        if (i % 100 == 99) {
          Thread.yield();
        }
      } catch (Exception e) {
        // ZAP: Log exceptions
        log.warn(e.getMessage(), e);
      }
    }

    // update siteTree reference
    list =
        model
            .getDb()
            .getTableHistory()
            .getHistoryList(getSessionId(), HistoryReference.TYPE_SPIDER);

    for (int i = 0; i < list.size(); i++) {
      int historyId = list.get(i);

      try {
        HistoryReference historyRef = new HistoryReference(historyId);
        getSiteTree().addPath(historyRef);

        // Give other threads a chance to run after every 100 entries
        if (i % 100 == 99) {
          Thread.yield();
        }
      } catch (Exception e) {
        // Skip the entry but leave a trace, same as for the manual history above
        log.warn(e.getMessage(), e);
      }
    }

    System.gc();
  }
  /**
   * Renders this entry as {@code url = "...", lastMod = ..., changeFreq = ..., priority = ...}.
   *
   * <p>The last-modified date is printed with {@code SiteMap.getFullDateFormat()}, or as the text
   * {@code null} when no date is set.
   */
  @Override
  public String toString() {
    StringBuilder text = new StringBuilder();

    text.append("url = \"");
    text.append(url);
    text.append("\"");

    text.append(", lastMod = ");
    if (lastModified == null) {
      text.append("null");
    } else {
      text.append(SiteMap.getFullDateFormat().format(lastModified));
    }

    text.append(", changeFreq = ");
    text.append(changeFreq);

    text.append(", priority = ");
    text.append(priority);

    return text.toString();
  }
  /**
   * Processes a text-based sitemap. Text sitemaps only list URLs, one per line, without
   * priorities, last-modified dates, etc.
   *
   * <p>The content is decoded as UTF-8 behind a {@code BOMInputStream}. Empty lines are skipped
   * and reading stops once {@code MAX_URLS} lines have been handed to {@code addUrlIntoSitemap}.
   *
   * @param sitemapUrl URL of the sitemap, passed to the {@code SiteMap} constructor
   * @param content raw bytes of the sitemap
   * @return the sitemap of type {@code SitemapType.TEXT}, marked as processed
   * @throws IOException if the content cannot be read
   */
  private SiteMap processText(String sitemapUrl, byte[] content) throws IOException {
    LOG.debug("Processing textual Sitemap");

    SiteMap textSiteMap = new SiteMap(sitemapUrl);
    textSiteMap.setType(SitemapType.TEXT);

    BOMInputStream bomIs = new BOMInputStream(new ByteArrayInputStream(content));
    // The reader only wraps an in-memory byte array, hence it is not closed explicitly
    @SuppressWarnings("resource")
    BufferedReader reader = new BufferedReader(new InputStreamReader(bomIs, "UTF-8"));

    String line;
    // Zero-based, like the index the other parsers pass: addUrlIntoSitemap logs urlIndex + 1
    int urlIndex = 0;
    // Stop reading as soon as the limit is reached instead of scanning the remaining lines
    while (urlIndex < MAX_URLS && (line = reader.readLine()) != null) {
      if (line.length() > 0) {
        addUrlIntoSitemap(line, textSiteMap, null, null, null, urlIndex++);
      }
    }
    textSiteMap.setProcessed(true);

    return textSiteMap;
  }
示例#9
0
  /**
   * Constructor for the current session. The current system time will be used as the session ID.
   *
   * @param sessionId
   */
  public Session(Model model) {
    super(ROOT);

    // add session variable here
    setSessionId(System.currentTimeMillis());
    setSessionName("Untitled Session");
    setSessionDesc("");

    // create default object
    this.siteTree = SiteMap.createTree(model);
    this.model = model;
  }
示例#10
0
  /**
   * Saves the session synchronously under the given file name.
   *
   * <p>After the session file is written, the session database is moved to {@code fileName} when
   * the session is in its new state, or copied there when the name differs from the current one.
   * The site tree is then saved while holding its lock, and the session table row is updated.
   *
   * @param fileName name of the file to save the session to
   * @throws Exception if any of the save steps fails
   */
  public void save(String fileName) throws Exception {
    saveFile(fileName);

    if (isNewState()) {
      model.moveSessionDb(fileName);
    } else if (!this.fileName.equals(fileName)) {
      // Saving under a different name: copy the database to the new file name
      model.copySessionDb(this.fileName, fileName);
    }
    this.fileName = fileName;

    synchronized (siteTree) {
      SiteNode root = (SiteNode) siteTree.getRoot();
      saveSiteTree(root);
    }

    model.getDb().getTableSession().update(getSessionId(), getSessionName());
  }
示例#11
0
  /**
   * Generates the sitemap files and the sitemap index that references them.
   *
   * <p>For each page provider a {@code SiteMap} group is filled with the provider's non-null
   * pages and finished; every file of that group is then listed in the index as a {@code
   * <sitemap>} element with its resolved location and the current date as {@code lastmod}. The
   * writer is closed when generation ends, whether it succeeded or failed.
   *
   * @throws IOException if the files could not be created.
   * @throws SAXException if a xml error occurs.
   */
  public void generate() throws IOException, SAXException {

    int totalCount = 0;

    try {
      AttributesImpl schemaLocation = new AttributesImpl();

      transformerHandler.startDocument();
      transformerHandler.startPrefixMapping("xsd", XMLConstants.W3C_XML_SCHEMA_NS_URI);
      transformerHandler.startPrefixMapping("xsi", XMLConstants.W3C_XML_SCHEMA_INSTANCE_NS_URI);

      // The attribute is xsi:schemaLocation, so it belongs to the namespace the "xsi" prefix is
      // mapped to (XMLSchema-instance), not to the XML Schema namespace itself.
      // NOTE(review): a schemaLocation value is normally a "namespace location" pair; the value
      // is kept unchanged here - confirm whether the namespace part should be prepended.
      schemaLocation.addAttribute(
          XMLConstants.W3C_XML_SCHEMA_INSTANCE_NS_URI,
          "schemaLocation",
          "xsi:schemaLocation",
          "CDATA",
          "http://www.sitemaps.org/schemas/sitemap/0.9/siteindex.xsd");

      transformerHandler.startElement(NS, "", "sitemapindex", schemaLocation);

      for (final PageProvider provider : providers) {

        LOG.info("Processing " + provider.getName());

        final SiteMap group = new SiteMap(provider.getName());

        try {
          for (final Page page : provider) {
            if (null != page) {
              group.addPage(page);
            }
          }
        } finally {
          // Finish the group and account for its entries even if the provider failed midway
          group.finish();
          LOG.info(group.getCount() + " entries processed for " + provider.getName());
          totalCount += group.getCount();
        }

        for (final SiteMap.SiteMapFile map : group.getSiteMaps()) {
          transformerHandler.startElement("", "", "sitemap", new AttributesImpl());
          addElement("loc", uri.resolve(map.getFileName()).toString());
          addElement("lastmod", formatDateW3c((new Date())));
          transformerHandler.endElement("", "", "sitemap");
        }
      }

      transformerHandler.endElement(NS, "", "sitemapindex");
      transformerHandler.endDocument();
    } finally {
      // Release the writer on failure as well, not only after a successful run
      writer.close();
    }

    LOG.info("All done (" + totalCount + " entries)");
  }
示例#12
0
  /**
   * Builds a {@link SiteMapIndex} from the {@code <sitemap>} nodes of a sitemap index.
   *
   * <p>At most {@code MAX_URLS} nodes are read. The location of each sitemap is taken from its
   * {@code <loc>} child or, when there is none, from the trimmed text content of the node. A
   * location that is not a well-formed URL is logged and skipped. Example sitemap index:
   *
   * <pre>{@code
   * <sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
   *   <sitemap>
   *     <loc>http://www.example.com/sitemap1.xml.gz</loc>
   *     <lastmod>2004-10-01T18:23:17+00:00</lastmod>
   *   </sitemap>
   *   <sitemap>
   *     <loc>http://www.example.com/sitemap2.xml.gz</loc>
   *     <lastmod>2005-01-01</lastmod>
   *   </sitemap>
   * </sitemapindex>
   * }</pre>
   *
   * @param url URL of the sitemap index
   * @param nodeList the {@code <sitemap>} nodes to read
   * @return the sitemap index, marked as processed
   */
  private SiteMapIndex parseSitemapIndex(URL url, NodeList nodeList) {

    LOG.debug("Parsing Sitemap Index");

    SiteMapIndex index = new SiteMapIndex(url);
    index.setType(SitemapType.INDEX);

    for (int idx = 0; idx < nodeList.getLength() && idx < MAX_URLS; idx++) {
      Node candidate = nodeList.item(idx);
      if (candidate.getNodeType() != Node.ELEMENT_NODE) {
        continue;
      }

      Element sitemapElem = (Element) candidate;
      String loc = getElementValue(sitemapElem, "loc");
      if (loc == null) {
        // No <loc> child: fall back to the text content of the element itself
        loc = sitemapElem.getTextContent().trim();
      }

      try {
        URL sitemapUrl = new URL(loc);
        String lastmod = getElementValue(sitemapElem, "lastmod");
        Date lastModified = SiteMap.convertToDate(lastmod);

        // Sitemap URLs pointing to a different website are accepted for now
        SiteMap child = new SiteMap(sitemapUrl, lastModified);
        index.addSitemap(child);
        LOG.debug("  {}. {}", (idx + 1), child);
      } catch (MalformedURLException e) {
        LOG.trace("Don't create an entry with a bad URL", e);
        LOG.debug("Bad url: [{}]", loc);
      }
    }

    index.setProcessed(true);
    return index;
  }
示例#13
0
 /**
  * Sets when this URL was last modified from its textual form.
  *
  * <p>The text is converted with {@code SiteMap.convertToDate} and the result is stored as is.
  *
  * @param lastModified textual last-modification date to convert and store
  */
 public void setLastModified(String lastModified) {
   this.lastModified = SiteMap.convertToDate(lastModified);
 }