public boolean clear() throws InterruptedException { this.profilesActiveCrawlsCache.clear(); CrawlProfile entry; boolean hasDoneSomething = false; try { for (final byte[] handle : this.profilesActiveCrawls.keySet()) { // check for interruption if (Thread.currentThread().isInterrupted()) throw new InterruptedException("Shutdown in progress"); // getting next profile try { entry = new CrawlProfile(this.profilesActiveCrawls.get(handle)); } catch (final IOException e) { continue; } catch (final RowSpaceExceededException e) { continue; } if (!((entry.name().equals(CRAWL_PROFILE_PROXY)) || (entry.name().equals(CRAWL_PROFILE_REMOTE)) || (entry.name().equals(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT)) || (entry.name().equals(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT)) || (entry.name().equals(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA)) || (entry.name().equals(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA)) || (entry.name().equals(CRAWL_PROFILE_SURROGATE)))) { final CrawlProfile p = new CrawlProfile(entry); this.profilesPassiveCrawls.put(UTF8.getBytes(p.handle()), p); this.profilesActiveCrawls.remove(handle); hasDoneSomething = true; } } } catch (final kelondroException e) { resetProfiles(); hasDoneSomething = true; } return hasDoneSomething; }
public CrawlSwitchboard(final String networkName, final Log log, final File queuesRoot) { log.logInfo("Initializing Word Index for the network '" + networkName + "'."); if (networkName == null || networkName.length() == 0) { log.logSevere("no network name given - shutting down"); System.exit(0); } this.log = log; this.profilesActiveCrawlsCache = Collections.synchronizedMap(new TreeMap<byte[], CrawlProfile>(Base64Order.enhancedCoder)); // make crawl profiles database and default profiles this.queuesRoot = queuesRoot; this.queuesRoot.mkdirs(); this.log.logConfig("Initializing Crawl Profiles"); final File profilesActiveFile = new File(queuesRoot, DBFILE_ACTIVE_CRAWL_PROFILES); this.profilesActiveCrawls = loadFromDB(profilesActiveFile); for (final byte[] handle : this.profilesActiveCrawls.keySet()) { CrawlProfile p; try { p = new CrawlProfile(this.profilesActiveCrawls.get(handle)); } catch (final IOException e) { p = null; } catch (final RowSpaceExceededException e) { p = null; } if (p == null) continue; if (!RegexHelper.isValidRegex(p.get(CrawlProfile.FILTER_URL_MUSTMATCH))) { removeActive(handle); Log.logWarning( "CrawlProfiles", "removed Profile " + p.handle() + ": " + p.name() + " from active crawls since " + CrawlProfile.FILTER_URL_MUSTMATCH + " is no valid regular expression: " + p.get(CrawlProfile.FILTER_URL_MUSTMATCH)); } else if (!RegexHelper.isValidRegex(p.get(CrawlProfile.FILTER_URL_MUSTNOTMATCH))) { removeActive(handle); Log.logWarning( "CrawlProfiles", "removed Profile " + p.handle() + ": " + p.name() + " from active crawls since " + CrawlProfile.FILTER_URL_MUSTNOTMATCH + " is no valid regular expression: " + p.get(CrawlProfile.FILTER_URL_MUSTNOTMATCH)); } else { Log.logInfo("CrawlProfiles", "loaded Profile " + p.handle() + ": " + p.name()); } } initActiveCrawlProfiles(); log.logInfo( "Loaded active crawl profiles from file " + profilesActiveFile.getName() + ", " + this.profilesActiveCrawls.size() + " entries"); final File profilesPassiveFile = new File(queuesRoot, DBFILE_PASSIVE_CRAWL_PROFILES); this.profilesPassiveCrawls = loadFromDB(profilesPassiveFile); for (final byte[] handle : this.profilesPassiveCrawls.keySet()) { CrawlProfile p; try { p = new CrawlProfile(this.profilesPassiveCrawls.get(handle)); Log.logInfo("CrawlProfiles", "loaded Profile " + p.handle() + ": " + p.name()); } catch (final IOException e) { continue; } catch (final RowSpaceExceededException e) { continue; } } log.logInfo( "Loaded passive crawl profiles from file " + profilesPassiveFile.getName() + ", " + this.profilesPassiveCrawls.size() + " entries" + ", " + profilesPassiveFile.length() / 1024); }
private void initActiveCrawlProfiles() { this.defaultProxyProfile = null; this.defaultRemoteProfile = null; this.defaultTextSnippetLocalProfile = null; this.defaultTextSnippetGlobalProfile = null; this.defaultMediaSnippetLocalProfile = null; this.defaultMediaSnippetGlobalProfile = null; this.defaultSurrogateProfile = null; CrawlProfile profile; String name; try { for (final byte[] handle : this.profilesActiveCrawls.keySet()) { profile = new CrawlProfile(this.profilesActiveCrawls.get(handle)); name = profile.name(); if (name.equals(CRAWL_PROFILE_PROXY)) this.defaultProxyProfile = profile; if (name.equals(CRAWL_PROFILE_REMOTE)) this.defaultRemoteProfile = profile; if (name.equals(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT)) this.defaultTextSnippetLocalProfile = profile; if (name.equals(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT)) this.defaultTextSnippetGlobalProfile = profile; if (name.equals(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA)) this.defaultMediaSnippetLocalProfile = profile; if (name.equals(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA)) this.defaultMediaSnippetGlobalProfile = profile; if (name.equals(CRAWL_PROFILE_SURROGATE)) this.defaultSurrogateProfile = profile; } } catch (final Exception e) { this.profilesActiveCrawls.clear(); this.defaultProxyProfile = null; this.defaultRemoteProfile = null; this.defaultTextSnippetLocalProfile = null; this.defaultTextSnippetGlobalProfile = null; this.defaultMediaSnippetLocalProfile = null; this.defaultMediaSnippetGlobalProfile = null; this.defaultSurrogateProfile = null; } if (this.defaultProxyProfile == null) { // generate new default entry for proxy crawling this.defaultProxyProfile = new CrawlProfile( "proxy", null, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, "", 0 /*Integer.parseInt(getConfig(PROXY_PREFETCH_DEPTH, "0"))*/, true, CrawlProfile.getRecrawlDate(CRAWL_PROFILE_PROXY_RECRAWL_CYCLE), -1, false, true /*getConfigBool(PROXY_INDEXING_LOCAL_TEXT, true)*/, true /*getConfigBool(PROXY_INDEXING_LOCAL_MEDIA, true)*/, true, false /*getConfigBool(PROXY_INDEXING_REMOTE, false)*/, true, true, true, CacheStrategy.IFFRESH); this.profilesActiveCrawls.put( UTF8.getBytes(this.defaultProxyProfile.handle()), this.defaultProxyProfile); } if (this.defaultRemoteProfile == null) { // generate new default entry for remote crawling this.defaultRemoteProfile = new CrawlProfile( CRAWL_PROFILE_REMOTE, null, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, "", CrawlProfile.MATCH_NEVER_STRING, 0, true, -1, -1, true, true, true, false, false, true, true, false, CacheStrategy.IFFRESH); this.profilesActiveCrawls.put( UTF8.getBytes(this.defaultRemoteProfile.handle()), this.defaultRemoteProfile); } if (this.defaultTextSnippetLocalProfile == null) { // generate new default entry for snippet fetch and optional crawling this.defaultTextSnippetLocalProfile = new CrawlProfile( CRAWL_PROFILE_SNIPPET_LOCAL_TEXT, null, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, "", 0, true, CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE), -1, true, false, false, true, false, true, true, false, CacheStrategy.IFEXIST); this.profilesActiveCrawls.put( UTF8.getBytes(this.defaultTextSnippetLocalProfile.handle()), this.defaultTextSnippetLocalProfile); } if (this.defaultTextSnippetGlobalProfile == null) { // generate new default entry for snippet fetch and optional crawling this.defaultTextSnippetGlobalProfile = new CrawlProfile( CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT, null, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, "", 0, true, CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE), -1, true, true, true, true, false, true, true, false, CacheStrategy.IFEXIST); this.profilesActiveCrawls.put( UTF8.getBytes(this.defaultTextSnippetGlobalProfile.handle()), this.defaultTextSnippetGlobalProfile); } this.defaultTextSnippetGlobalProfile.setCacheStrategy(CacheStrategy.IFEXIST); if (this.defaultMediaSnippetLocalProfile == null) { // generate new default entry for snippet fetch and optional crawling this.defaultMediaSnippetLocalProfile = new CrawlProfile( CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA, null, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, "", 0, true, CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE), -1, true, false, false, true, false, true, true, false, CacheStrategy.IFEXIST); this.profilesActiveCrawls.put( UTF8.getBytes(this.defaultMediaSnippetLocalProfile.handle()), this.defaultMediaSnippetLocalProfile); } if (this.defaultMediaSnippetGlobalProfile == null) { // generate new default entry for snippet fetch and optional crawling this.defaultMediaSnippetGlobalProfile = new CrawlProfile( CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA, null, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, "", 0, true, CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE), -1, true, false, true, true, false, true, true, false, CacheStrategy.IFEXIST); this.profilesActiveCrawls.put( UTF8.getBytes(this.defaultMediaSnippetGlobalProfile.handle()), this.defaultMediaSnippetGlobalProfile); } if (this.defaultSurrogateProfile == null) { // generate new default entry for surrogate parsing this.defaultSurrogateProfile = new CrawlProfile( CRAWL_PROFILE_SURROGATE, null, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, "", 0, false, CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE), -1, true, true, false, false, false, true, true, false, CacheStrategy.NOCACHE); this.profilesActiveCrawls.put( UTF8.getBytes(this.defaultSurrogateProfile.handle()), this.defaultSurrogateProfile); } }