Beispiel #1
0
  /**
   * Moves every non-default profile out of the active crawl profiles and into the passive ones.
   *
   * <p>The active-profile cache is emptied first. Each active entry is then read back; entries
   * that cannot be read (IOException / RowSpaceExceededException) are skipped. Profiles whose
   * name equals one of the built-in profile names (proxy, remote, the four snippet profiles,
   * surrogate) stay where they are; every other profile is copied into the passive store and
   * removed from the active store. If a {@code kelondroException} is raised while doing so, the
   * profiles are reset through {@code resetProfiles()}.
   *
   * @return {@code true} if at least one profile was moved or the profiles were reset
   * @throws InterruptedException if the current thread is flagged as interrupted while iterating
   */
  public boolean clear() throws InterruptedException {
    this.profilesActiveCrawlsCache.clear();
    boolean changed = false;
    try {
      for (final byte[] key : this.profilesActiveCrawls.keySet()) {
        // stop as soon as an interruption (e.g. shutdown) has been signalled
        if (Thread.currentThread().isInterrupted()) {
          throw new InterruptedException("Shutdown in progress");
        }

        // read the stored profile; unreadable entries are left untouched
        final CrawlProfile active;
        try {
          active = new CrawlProfile(this.profilesActiveCrawls.get(key));
        } catch (final IOException e) {
          continue;
        } catch (final RowSpaceExceededException e) {
          continue;
        }

        // built-in profiles always remain active
        final String profileName = active.name();
        final boolean builtIn =
            profileName.equals(CRAWL_PROFILE_PROXY)
                || profileName.equals(CRAWL_PROFILE_REMOTE)
                || profileName.equals(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT)
                || profileName.equals(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT)
                || profileName.equals(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA)
                || profileName.equals(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA)
                || profileName.equals(CRAWL_PROFILE_SURROGATE);
        if (builtIn) {
          continue;
        }

        // everything else is copied to the passive store and dropped from the active one
        final CrawlProfile passive = new CrawlProfile(active);
        this.profilesPassiveCrawls.put(UTF8.getBytes(passive.handle()), passive);
        this.profilesActiveCrawls.remove(key);
        changed = true;
      }
    } catch (final kelondroException e) {
      // the store could not be processed: fall back to a full reset
      resetProfiles();
      changed = true;
    }
    return changed;
  }
Beispiel #2
0
  /**
   * Sets up the crawl switchboard: creates the queues directory, loads the active and passive
   * crawl profile stores from it and makes sure the default active profiles exist.
   *
   * <p>Active profiles whose {@code FILTER_URL_MUSTMATCH} or {@code FILTER_URL_MUSTNOTMATCH}
   * value is not a valid regular expression are removed from the active store while loading.
   * Entries that cannot be read are skipped. If no network name is given, the JVM is terminated
   * via {@code System.exit(0)}.
   *
   * @param networkName name of the network; must be non-null and non-empty
   * @param log logger used for the start-up messages and kept in this instance
   * @param queuesRoot directory holding the crawl profile files; created if necessary
   */
  public CrawlSwitchboard(final String networkName, final Log log, final File queuesRoot) {

    log.logInfo("Initializing Word Index for the network '" + networkName + "'.");

    final boolean networkNameMissing = networkName == null || networkName.length() == 0;
    if (networkNameMissing) {
      log.logSevere("no network name given - shutting down");
      System.exit(0);
    }
    this.log = log;
    this.profilesActiveCrawlsCache =
        Collections.synchronizedMap(new TreeMap<byte[], CrawlProfile>(Base64Order.enhancedCoder));

    // prepare the directory that holds the crawl profile databases
    this.queuesRoot = queuesRoot;
    this.queuesRoot.mkdirs();
    this.log.logConfig("Initializing Crawl Profiles");

    // --- active profiles ---
    final File activeDb = new File(queuesRoot, DBFILE_ACTIVE_CRAWL_PROFILES);
    this.profilesActiveCrawls = loadFromDB(activeDb);
    for (final byte[] key : this.profilesActiveCrawls.keySet()) {
      final CrawlProfile profile;
      try {
        profile = new CrawlProfile(this.profilesActiveCrawls.get(key));
      } catch (final IOException e) {
        continue; // unreadable entry: skip it
      } catch (final RowSpaceExceededException e) {
        continue; // unreadable entry: skip it
      }

      // the must-not-match filter is only examined when the must-match filter is valid
      final boolean mustMatchValid =
          RegexHelper.isValidRegex(profile.get(CrawlProfile.FILTER_URL_MUSTMATCH));
      if (mustMatchValid
          && RegexHelper.isValidRegex(profile.get(CrawlProfile.FILTER_URL_MUSTNOTMATCH))) {
        Log.logInfo(
            "CrawlProfiles", "loaded Profile " + profile.handle() + ": " + profile.name());
        continue;
      }

      // one of the filters is broken: drop the profile and report which filter was at fault
      removeActive(key);
      if (mustMatchValid) {
        Log.logWarning(
            "CrawlProfiles",
            "removed Profile "
                + profile.handle()
                + ": "
                + profile.name()
                + " from active crawls since "
                + CrawlProfile.FILTER_URL_MUSTNOTMATCH
                + " is no valid regular expression: "
                + profile.get(CrawlProfile.FILTER_URL_MUSTNOTMATCH));
      } else {
        Log.logWarning(
            "CrawlProfiles",
            "removed Profile "
                + profile.handle()
                + ": "
                + profile.name()
                + " from active crawls since "
                + CrawlProfile.FILTER_URL_MUSTMATCH
                + " is no valid regular expression: "
                + profile.get(CrawlProfile.FILTER_URL_MUSTMATCH));
      }
    }
    initActiveCrawlProfiles();
    log.logInfo(
        "Loaded active crawl profiles from file "
            + activeDb.getName()
            + ", "
            + this.profilesActiveCrawls.size()
            + " entries");

    // --- passive profiles ---
    final File passiveDb = new File(queuesRoot, DBFILE_PASSIVE_CRAWL_PROFILES);
    this.profilesPassiveCrawls = loadFromDB(passiveDb);
    for (final byte[] key : this.profilesPassiveCrawls.keySet()) {
      try {
        final CrawlProfile profile = new CrawlProfile(this.profilesPassiveCrawls.get(key));
        Log.logInfo(
            "CrawlProfiles", "loaded Profile " + profile.handle() + ": " + profile.name());
      } catch (final IOException e) {
        // unreadable entry: nothing to log, move on to the next one
      } catch (final RowSpaceExceededException e) {
        // unreadable entry: nothing to log, move on to the next one
      }
    }
    // the trailing number is the passive file's length divided by 1024
    log.logInfo(
        "Loaded passive crawl profiles from file "
            + passiveDb.getName()
            + ", "
            + this.profilesPassiveCrawls.size()
            + " entries"
            + ", "
            + passiveDb.length() / 1024);
  }
Beispiel #3
0
  /**
   * Binds the seven default crawl profiles (proxy, remote, local/global text snippet,
   * local/global media snippet, surrogate) to their fields, creating any that are missing.
   *
   * <p>All default-profile fields are first cleared, then the active profile store is scanned
   * and each stored profile whose name equals one of the {@code CRAWL_PROFILE_*} constants is
   * assigned to the matching field. If anything goes wrong during that scan, the whole active
   * store is cleared, a warning is logged and every default is regenerated. Each default that is
   * still {@code null} afterwards is created and written into the active store under its handle.
   */
  private void initActiveCrawlProfiles() {
    this.defaultProxyProfile = null;
    this.defaultRemoteProfile = null;
    this.defaultTextSnippetLocalProfile = null;
    this.defaultTextSnippetGlobalProfile = null;
    this.defaultMediaSnippetLocalProfile = null;
    this.defaultMediaSnippetGlobalProfile = null;
    this.defaultSurrogateProfile = null;
    CrawlProfile profile;
    String name;
    try {
      // pick up the default profiles that are already stored
      for (final byte[] handle : this.profilesActiveCrawls.keySet()) {
        profile = new CrawlProfile(this.profilesActiveCrawls.get(handle));
        name = profile.name();
        if (name.equals(CRAWL_PROFILE_PROXY)) this.defaultProxyProfile = profile;
        if (name.equals(CRAWL_PROFILE_REMOTE)) this.defaultRemoteProfile = profile;
        if (name.equals(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT))
          this.defaultTextSnippetLocalProfile = profile;
        if (name.equals(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT))
          this.defaultTextSnippetGlobalProfile = profile;
        if (name.equals(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA))
          this.defaultMediaSnippetLocalProfile = profile;
        if (name.equals(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA))
          this.defaultMediaSnippetGlobalProfile = profile;
        if (name.equals(CRAWL_PROFILE_SURROGATE)) this.defaultSurrogateProfile = profile;
      }
    } catch (final Exception e) {
      // The store is treated as unusable: wipe it and start over with fresh defaults.
      // This discards every active profile, so leave a trace in the log instead of failing
      // silently.
      Log.logWarning(
          "CrawlProfiles",
          "could not read active crawl profiles, clearing them and regenerating defaults: " + e);
      this.profilesActiveCrawls.clear();
      this.defaultProxyProfile = null;
      this.defaultRemoteProfile = null;
      this.defaultTextSnippetLocalProfile = null;
      this.defaultTextSnippetGlobalProfile = null;
      this.defaultMediaSnippetLocalProfile = null;
      this.defaultMediaSnippetGlobalProfile = null;
      this.defaultSurrogateProfile = null;
    }

    if (this.defaultProxyProfile == null) {
      // generate new default entry for proxy crawling
      // The profile is named with CRAWL_PROFILE_PROXY (not a string literal) because the scan
      // above recognises it by exactly that constant.
      this.defaultProxyProfile =
          new CrawlProfile(
              CRAWL_PROFILE_PROXY,
              null,
              CrawlProfile.MATCH_ALL_STRING,
              CrawlProfile.MATCH_NEVER_STRING,
              CrawlProfile.MATCH_ALL_STRING,
              CrawlProfile.MATCH_NEVER_STRING,
              "",
              0 /*Integer.parseInt(getConfig(PROXY_PREFETCH_DEPTH, "0"))*/,
              true,
              CrawlProfile.getRecrawlDate(CRAWL_PROFILE_PROXY_RECRAWL_CYCLE),
              -1,
              false,
              true /*getConfigBool(PROXY_INDEXING_LOCAL_TEXT, true)*/,
              true /*getConfigBool(PROXY_INDEXING_LOCAL_MEDIA, true)*/,
              true,
              false /*getConfigBool(PROXY_INDEXING_REMOTE, false)*/,
              true,
              true,
              true,
              CacheStrategy.IFFRESH);
      this.profilesActiveCrawls.put(
          UTF8.getBytes(this.defaultProxyProfile.handle()), this.defaultProxyProfile);
    }
    if (this.defaultRemoteProfile == null) {
      // generate new default entry for remote crawling
      // NOTE(review): the five match-string arguments below are ordered
      // (ALL, ALL, NEVER, "", NEVER), whereas every other default profile in this method passes
      // (ALL, NEVER, ALL, NEVER, ""). This may be a transposition - confirm against the
      // CrawlProfile constructor signature. Left unchanged here.
      this.defaultRemoteProfile =
          new CrawlProfile(
              CRAWL_PROFILE_REMOTE,
              null,
              CrawlProfile.MATCH_ALL_STRING,
              CrawlProfile.MATCH_ALL_STRING,
              CrawlProfile.MATCH_NEVER_STRING,
              "",
              CrawlProfile.MATCH_NEVER_STRING,
              0,
              true,
              -1,
              -1,
              true,
              true,
              true,
              false,
              false,
              true,
              true,
              false,
              CacheStrategy.IFFRESH);
      this.profilesActiveCrawls.put(
          UTF8.getBytes(this.defaultRemoteProfile.handle()), this.defaultRemoteProfile);
    }
    if (this.defaultTextSnippetLocalProfile == null) {
      // generate new default entry for snippet fetch and optional crawling
      this.defaultTextSnippetLocalProfile =
          new CrawlProfile(
              CRAWL_PROFILE_SNIPPET_LOCAL_TEXT,
              null,
              CrawlProfile.MATCH_ALL_STRING,
              CrawlProfile.MATCH_NEVER_STRING,
              CrawlProfile.MATCH_ALL_STRING,
              CrawlProfile.MATCH_NEVER_STRING,
              "",
              0,
              true,
              CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE),
              -1,
              true,
              false,
              false,
              true,
              false,
              true,
              true,
              false,
              CacheStrategy.IFEXIST);
      this.profilesActiveCrawls.put(
          UTF8.getBytes(this.defaultTextSnippetLocalProfile.handle()),
          this.defaultTextSnippetLocalProfile);
    }
    if (this.defaultTextSnippetGlobalProfile == null) {
      // generate new default entry for snippet fetch and optional crawling
      this.defaultTextSnippetGlobalProfile =
          new CrawlProfile(
              CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT,
              null,
              CrawlProfile.MATCH_ALL_STRING,
              CrawlProfile.MATCH_NEVER_STRING,
              CrawlProfile.MATCH_ALL_STRING,
              CrawlProfile.MATCH_NEVER_STRING,
              "",
              0,
              true,
              CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE),
              -1,
              true,
              true,
              true,
              true,
              false,
              true,
              true,
              false,
              CacheStrategy.IFEXIST);
      this.profilesActiveCrawls.put(
          UTF8.getBytes(this.defaultTextSnippetGlobalProfile.handle()),
          this.defaultTextSnippetGlobalProfile);
    }
    // applied unconditionally, i.e. also to a global text snippet profile loaded from the store;
    // the changed profile is not written back to the store here
    this.defaultTextSnippetGlobalProfile.setCacheStrategy(CacheStrategy.IFEXIST);
    if (this.defaultMediaSnippetLocalProfile == null) {
      // generate new default entry for snippet fetch and optional crawling
      this.defaultMediaSnippetLocalProfile =
          new CrawlProfile(
              CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA,
              null,
              CrawlProfile.MATCH_ALL_STRING,
              CrawlProfile.MATCH_NEVER_STRING,
              CrawlProfile.MATCH_ALL_STRING,
              CrawlProfile.MATCH_NEVER_STRING,
              "",
              0,
              true,
              CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE),
              -1,
              true,
              false,
              false,
              true,
              false,
              true,
              true,
              false,
              CacheStrategy.IFEXIST);
      this.profilesActiveCrawls.put(
          UTF8.getBytes(this.defaultMediaSnippetLocalProfile.handle()),
          this.defaultMediaSnippetLocalProfile);
    }
    if (this.defaultMediaSnippetGlobalProfile == null) {
      // generate new default entry for snippet fetch and optional crawling
      this.defaultMediaSnippetGlobalProfile =
          new CrawlProfile(
              CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA,
              null,
              CrawlProfile.MATCH_ALL_STRING,
              CrawlProfile.MATCH_NEVER_STRING,
              CrawlProfile.MATCH_ALL_STRING,
              CrawlProfile.MATCH_NEVER_STRING,
              "",
              0,
              true,
              CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE),
              -1,
              true,
              false,
              true,
              true,
              false,
              true,
              true,
              false,
              CacheStrategy.IFEXIST);
      this.profilesActiveCrawls.put(
          UTF8.getBytes(this.defaultMediaSnippetGlobalProfile.handle()),
          this.defaultMediaSnippetGlobalProfile);
    }
    if (this.defaultSurrogateProfile == null) {
      // generate new default entry for surrogate parsing
      this.defaultSurrogateProfile =
          new CrawlProfile(
              CRAWL_PROFILE_SURROGATE,
              null,
              CrawlProfile.MATCH_ALL_STRING,
              CrawlProfile.MATCH_NEVER_STRING,
              CrawlProfile.MATCH_ALL_STRING,
              CrawlProfile.MATCH_NEVER_STRING,
              "",
              0,
              false,
              CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE),
              -1,
              true,
              true,
              false,
              false,
              false,
              true,
              true,
              false,
              CacheStrategy.NOCACHE);
      this.profilesActiveCrawls.put(
          UTF8.getBytes(this.defaultSurrogateProfile.handle()), this.defaultSurrogateProfile);
    }
  }