Example #1
  private static int importFromBookmarks(
      BookmarksDB db,
      final DigestURI baseURL,
      final InputStreamReader input,
      final String tag,
      final boolean importPublic) {

    int importCount = 0;

    Map<MultiProtocolURI, Properties> links = new HashMap<MultiProtocolURI, Properties>();
    String title;
    MultiProtocolURI url;
    Bookmark bm;
    final Set<String> tags = ListManager.string2set(tag); // this allows multiple default tags
    try {
      // load the links
      final ContentScraper scraper = new ContentScraper(baseURL);
      // feed the input through a TransformerWriter so the scraper collects all anchors
      final Writer writer = new TransformerWriter(null, null, scraper, null, false);
      FileUtils.copy(input, writer);
      writer.close();
      links = scraper.getAnchors();
    } catch (final IOException e) {
      Log.logWarning(
          "BOOKMARKS", "error during load of links: " + e.getClass() + " " + e.getMessage());
    }
    for (final Entry<MultiProtocolURI, Properties> link : links.entrySet()) {
      url = link.getKey();
      title = link.getValue().getProperty("name", "");
      Log.logInfo("BOOKMARKS", "links.get(url)");
      if ("".equals(title)) { // cannot be displayed
        title = url.toString();
      }
      bm = db.new Bookmark(url.toString());
      bm.setProperty(Bookmark.BOOKMARK_TITLE, title);
      bm.setTags(tags);
      bm.setPublic(importPublic);
      db.saveBookmark(bm);

      importCount++;
    }

    return importCount;
  }
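
For context, a caller might wrap this method roughly as sketched below; the file name, charset, base URL, and tag value are illustrative assumptions, not part of the original class.

  // Hypothetical caller: imports a browser bookmarks.html export into the BookmarksDB.
  // File name, charset, base URL and default tag are placeholders.
  public static int importBookmarksFile(final BookmarksDB db, final File exportFile) {
    InputStreamReader reader = null;
    try {
      reader = new InputStreamReader(new FileInputStream(exportFile), "UTF-8");
      return importFromBookmarks(
          db,
          new DigestURI("http://localhost/"), // base URL for resolving relative links
          reader,
          "imported", // default tag(s), comma-separated
          false); // keep imported bookmarks private
    } catch (final IOException e) {
      Log.logWarning("BOOKMARKS", "could not import " + exportFile + ": " + e.getMessage());
      return 0;
    } finally {
      if (reader != null) {
        try { reader.close(); } catch (final IOException e) {}
      }
    }
  }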
Example #2
  /**
   * Returns the remaining disk space of the volume as a normalized level.
   *
   * @return
   *     <ul>
   *       <li><code>HIGH</code> if enough disk space is available
   *       <li><code>MEDIUM</code> if free space dropped below the soft limit
   *       <li><code>LOW</code> if free space dropped below the hard limit
   *     </ul>
   */
  private Space getNormalizedDiskFree() {
    final long currentSpace = getUsableSpace(this.path);
    // a value below 1 means the free space could not be determined; treat it as sufficient
    if (currentSpace < 1L) return Space.HIGH;
    Space ret = Space.HIGH;

    if (currentSpace < getMinFreeDiskSpace()) {
      log.logWarning(
          "Volume "
              + this.path.toString()
              + ": free space ("
              + (currentSpace / 1024 / 1024)
              + " MB) is too low (< "
              + (getMinFreeDiskSpace() / 1024 / 1024)
              + " MB)");
      ret = Space.MEDIUM;
    }
    if (currentSpace < getMinFreeDiskSpace_hardlimit()) {
      ret = Space.LOW;
    }
    return ret;
  }
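
For context, a caller could translate the returned level into behaviour along these lines; the method name checkDiskSpace and the pauseCrawling() hook are illustrative assumptions, not part of the class shown.

  // Hypothetical reaction to the normalized level; pauseCrawling() is a placeholder hook.
  private void checkDiskSpace() {
    switch (getNormalizedDiskFree()) {
      case LOW:
        // below the hard limit: stop disk-heavy work entirely
        log.logSevere("Volume " + this.path + ": hard limit reached, pausing crawl");
        pauseCrawling();
        break;
      case MEDIUM:
        // below the soft limit: keep running, but warn
        log.logWarning("Volume " + this.path + ": free space is getting low");
        break;
      default:
        // HIGH: enough space, nothing to do
        break;
    }
  }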
Example #3
  public CrawlSwitchboard(final String networkName, final Log log, final File queuesRoot) {

    log.logInfo("Initializing Word Index for the network '" + networkName + "'.");

    if (networkName == null || networkName.length() == 0) {
      log.logSevere("no network name given - shutting down");
      System.exit(0);
    }
    this.log = log;
    this.profilesActiveCrawlsCache =
        Collections.synchronizedMap(new TreeMap<byte[], CrawlProfile>(Base64Order.enhancedCoder));

    // make crawl profiles database and default profiles
    this.queuesRoot = queuesRoot;
    this.queuesRoot.mkdirs();
    this.log.logConfig("Initializing Crawl Profiles");

    final File profilesActiveFile = new File(queuesRoot, DBFILE_ACTIVE_CRAWL_PROFILES);
    this.profilesActiveCrawls = loadFromDB(profilesActiveFile);
    // validate all active crawl profiles; drop entries with invalid regular expressions
    for (final byte[] handle : this.profilesActiveCrawls.keySet()) {
      CrawlProfile p;
      try {
        p = new CrawlProfile(this.profilesActiveCrawls.get(handle));
      } catch (final IOException e) {
        p = null;
      } catch (final RowSpaceExceededException e) {
        p = null;
      }
      if (p == null) continue;
      if (!RegexHelper.isValidRegex(p.get(CrawlProfile.FILTER_URL_MUSTMATCH))) {
        removeActive(handle);
        Log.logWarning(
            "CrawlProfiles",
            "removed Profile "
                + p.handle()
                + ": "
                + p.name()
                + " from active crawls since "
                + CrawlProfile.FILTER_URL_MUSTMATCH
                + " is no valid regular expression: "
                + p.get(CrawlProfile.FILTER_URL_MUSTMATCH));
      } else if (!RegexHelper.isValidRegex(p.get(CrawlProfile.FILTER_URL_MUSTNOTMATCH))) {
        removeActive(handle);
        Log.logWarning(
            "CrawlProfiles",
            "removed Profile "
                + p.handle()
                + ": "
                + p.name()
                + " from active crawls since "
                + CrawlProfile.FILTER_URL_MUSTNOTMATCH
                + " is no valid regular expression: "
                + p.get(CrawlProfile.FILTER_URL_MUSTNOTMATCH));
      } else {
        Log.logInfo("CrawlProfiles", "loaded Profile " + p.handle() + ": " + p.name());
      }
    }
    initActiveCrawlProfiles();
    log.logInfo(
        "Loaded active crawl profiles from file "
            + profilesActiveFile.getName()
            + ", "
            + this.profilesActiveCrawls.size()
            + " entries");

    final File profilesPassiveFile = new File(queuesRoot, DBFILE_PASSIVE_CRAWL_PROFILES);
    this.profilesPassiveCrawls = loadFromDB(profilesPassiveFile);
    // passive profiles are only parsed and logged; entries that cannot be read are skipped
    for (final byte[] handle : this.profilesPassiveCrawls.keySet()) {
      CrawlProfile p;
      try {
        p = new CrawlProfile(this.profilesPassiveCrawls.get(handle));
        Log.logInfo("CrawlProfiles", "loaded Profile " + p.handle() + ": " + p.name());
      } catch (final IOException e) {
        continue;
      } catch (final RowSpaceExceededException e) {
        continue;
      }
    }
    log.logInfo(
        "Loaded passive crawl profiles from file "
            + profilesPassiveFile.getName()
            + ", "
            + this.profilesPassiveCrawls.size()
            + " entries"
            + ", "
            + profilesPassiveFile.length() / 1024
            + " kbytes");
  }
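
For context, a hypothetical instantiation of this constructor might look like the snippet below; the network name, logger name, and queue directory are placeholders, not values taken from the original code.

  // Hypothetical setup; "freeworld", "SWITCHBOARD" and the queue directory are placeholders.
  final Log log = new Log("SWITCHBOARD");
  final CrawlSwitchboard crawlSwitchboard =
      new CrawlSwitchboard("freeworld", log, new File("DATA/QUEUES"));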