/**
 * Tests that:
 *
 * <ol>
 * <li>The given domain configuration and job are not null.
 * <li>The job does not already contain the given domain configuration.
 * <li>The domain configuration has the same order xml name as the first inserted domain config.
 * </ol>
 *
 * @param job a given Job
 * @param cfg a given DomainConfiguration
 * @return true, if the given DomainConfiguration can be inserted into the given job
 */
private boolean checkAddDomainConfInvariant(Job job, DomainConfiguration cfg) {
    ArgumentNotValid.checkNotNull(job, "job");
    ArgumentNotValid.checkNotNull(cfg, "cfg");

    // Check that the domain in DomainConfiguration cfg is not already in this job.
    // domainName is used as key in domainConfigurationMap.
    if (job.getDomainConfigurationMap().containsKey(cfg.getDomainName())) {
        if (log.isDebugEnabled()) {
            log.debug("Job already has a configuration for Domain '" + cfg.getDomainName() + "'.");
        }
        return false;
    }

    // Check that the configuration uses the same harvest template as this job.
    String orderXMLname = job.getOrderXMLName();
    if (!orderXMLname.equals(cfg.getOrderXmlName())) {
        if (log.isDebugEnabled()) {
            log.debug("This Job only accepts configurations using the harvest template '" + orderXMLname
                    + "'. This configuration uses the harvest template '" + cfg.getOrderXmlName() + "'.");
        }
        return false;
    }

    return true;
}
/**
 * Assign a StoreMessage to a specific filename. If the filename is already associated with a
 * StoreMessage, then this StoreMessage will be overwritten by the new StoreMessage.
 *
 * @param filename The name of the file to have a StoreMessage assigned.
 * @param msg The StoreMessage to be assigned to a file.
 * @throws ArgumentNotValid If the StoreMessage is null or if the filename is either null or the
 *     empty string.
 */
@Override
public void setReplyInfo(String filename, StoreMessage msg) throws ArgumentNotValid {
    ArgumentNotValid.checkNotNull(msg, "StoreMessage msg");
    ArgumentNotValid.checkNotNullOrEmpty(filename, "String filename");

    // Put into the map, and overwrite any existing mapping.
    storeEntries.put(filename, msg);
}
@Override
public boolean postProcess(InputStream input, OutputStream output) {
    ArgumentNotValid.checkNotNull(input, "InputStream input");
    ArgumentNotValid.checkNotNull(output, "OutputStream output");

    // Let the loaded job handle the post processing.
    loadBatchJob();
    return loadedJob.postProcess(input, output);
}
/**
 * Returns the ReplicaStoreState of a given file in a specific replica.
 *
 * @param filename The name of the file for the ReplicaStoreState.
 * @param replicaChannelName The name of the identification channel that uniquely identifies the
 *     replica for the ReplicaStoreState.
 * @return The ReplicaStoreState of a given file in a specific replica.
 * @throws ArgumentNotValid If the filename or the replica channel name is null or the empty string.
 */
@Override
public ReplicaStoreState getState(String filename, String replicaChannelName) throws ArgumentNotValid {
    ArgumentNotValid.checkNotNullOrEmpty(filename, "String filename");
    ArgumentNotValid.checkNotNullOrEmpty(replicaChannelName, "String replicaChannelName");

    Replica rep = Channels.retrieveReplicaFromIdentifierChannel(replicaChannelName);

    // Retrieve the ReplicaStoreState from the database.
    return database.getReplicaStoreState(filename, rep.getId());
}
/**
 * Sets the checksum of a given file.
 *
 * <p>It should not be possible to change the checksum in the database through arcrepository.
 *
 * @param filename The name of the file to have the checksum changed.
 * @param checksum The new checksum for the file.
 * @throws ArgumentNotValid If the filename or the checksum is null or the empty string.
 * @throws IllegalState Always, since it is not allowed for arcrepository to change the checksum
 *     of a completed upload.
 */
@Override
public void setCheckSum(String filename, String checksum) throws ArgumentNotValid, IllegalState {
    ArgumentNotValid.checkNotNullOrEmpty(filename, "String filename");
    ArgumentNotValid.checkNotNullOrEmpty(checksum, "String checksum");

    // This will not be implemented.
    throw new IllegalState("It is not possible to change the checksum of a file in the database! "
            + "Only the checksum of a specific replicafileinfo can be changed.");
}
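// Illustrative usage sketch (not part of the original source): setCheckSum always rejects the
// change, so callers should expect the IllegalState. The filename and checksum are placeholders.
void demoChecksumChangeIsRejected() {
    try {
        setCheckSum("netarkivet-20100101-000000-00000.arc", "0192837465abcdef0192837465abcdef");
    } catch (IllegalState e) {
        // Expected: completed uploads cannot have their checksum changed through arcrepository.
        log.info("Checksum change rejected as expected", e);
    }
}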
/**
 * Method for adding an entry for administration.
 *
 * @param filename The name of the file to be stored.
 * @param msg The StoreMessage of the entry.
 * @param checksum The checksum of the entry.
 * @throws ArgumentNotValid If the filename or the checksum is null or the empty string.
 */
@Override
public void addEntry(String filename, StoreMessage msg, String checksum) throws ArgumentNotValid {
    ArgumentNotValid.checkNotNullOrEmpty(filename, "String filename");
    ArgumentNotValid.checkNotNullOrEmpty(checksum, "String checksum");

    // Insert this into the entries map.
    storeEntries.put(filename, msg);
    // Insert into the database.
    database.insertNewFileForUpload(filename, checksum);
}
/**
 * Retrieves a set with the name of the files with a specific ReplicaStoreState in a specific
 * replica.
 *
 * @param rep The replica where the files belong.
 * @param state The ReplicaStoreState for the files.
 * @return A set with the names of the files with a specific ReplicaStoreState in a specific
 *     replica.
 * @throws ArgumentNotValid If the Replica or the ReplicaStoreState is null.
 */
@Override
public Set<String> getAllFileNames(Replica rep, ReplicaStoreState state) throws ArgumentNotValid {
    ArgumentNotValid.checkNotNull(rep, "Replica rep");
    ArgumentNotValid.checkNotNull(state, "ReplicaStoreState state");

    // Initialise the set.
    Set<String> res = new HashSet<String>();
    // Put the collection of filenames into the set.
    res.addAll(database.retrieveFilenamesForReplicaEntries(rep.getId(), state));
    // Return the set.
    return res;
}
/**
 * Message to signal from a BitarchiveServer to the BitarchiveMonitorServer that the Bit Archive
 * Application identified by BA_ApplicationId has completed its part of the batch job.
 *
 * <p>Holds status information: list of files processed and a list of ARC files (file names) on
 * which the batch job failed.
 *
 * @param to the channel to which this message is to be sent (must be a BAMON channel)
 * @param baAppId Identifier for the machine sending this message, usually containing the IP
 *     address and http port number
 * @param originatingBatchMsgId the Id field from the original batch message
 * @param rf the remote file reference containing the output of the batch job (may be null if no
 *     output is generated).
 * @throws ArgumentNotValid If the BA_ApplicationId or the originatingBatchMsgId are null or
 *     empty, or if the channel 'to' is null.
 */
public BatchEndedMessage(ChannelID to, String baAppId, String originatingBatchMsgId, RemoteFile rf)
        throws ArgumentNotValid {
    super(to, Channels.getError());
    ArgumentNotValid.checkNotNull(to, "ChannelID to");
    ArgumentNotValid.checkNotNullOrEmpty(baAppId, "String baAppId");
    ArgumentNotValid.checkNotNullOrEmpty(originatingBatchMsgId, "String originatingBatchMsgId");

    this.baApplicationId = baAppId;
    this.originatingBatchMsgId = originatingBatchMsgId;
    this.rf = rf;
}
/**
 * Sets the store state of an entry to a specific value.
 *
 * @param filename The name of the file for the entry.
 * @param repChannelId The identification channel of the replica for the entry.
 * @param state The new state for the entry.
 * @throws ArgumentNotValid If the ReplicaStoreState is null, or if either the filename or the
 *     replica identification channel is either null or the empty string.
 */
@Override
public void setState(String filename, String repChannelId, ReplicaStoreState state) throws ArgumentNotValid {
    ArgumentNotValid.checkNotNull(state, "ReplicaStoreState state");
    ArgumentNotValid.checkNotNullOrEmpty(filename, "String filename");
    ArgumentNotValid.checkNotNullOrEmpty(repChannelId, "String repChannelId");

    // Retrieve the replica.
    Replica rep = Channels.retrieveReplicaFromIdentifierChannel(repChannelId);

    // Update the database.
    database.setReplicaStoreState(filename, rep.getId(), state);
}
/**
 * Message to signal from a BitarchiveServer to the BitarchiveMonitorServer that the Bit Archive
 * Application identified by BA_ApplicationId has completed its part of the batch job.
 *
 * <p>Holds status information: list of files processed and a list of ARC files (file names) on
 * which the batch job failed.
 *
 * @param to the channel to which this message is to be sent (must be a BAMON channel)
 * @param originatingBatchMsgId the Id field from the original batch message
 * @param status The object containing status info.
 */
public BatchEndedMessage(ChannelID to, String originatingBatchMsgId, BatchStatus status) {
    super(to, Channels.getError());
    ArgumentNotValid.checkNotNull(to, "to");
    ArgumentNotValid.checkNotNullOrEmpty(originatingBatchMsgId, "String originatingBatchMsgId");
    ArgumentNotValid.checkNotNull(status, "BatchStatus status");

    this.originatingBatchMsgId = originatingBatchMsgId;
    this.baApplicationId = status.getBitArchiveAppId();
    this.rf = status.getResultFile();
    this.noOfFilesProcessed = status.getNoOfFilesProcessed();
    this.filesFailed = status.getFilesFailed();
    this.exceptions = status.getExceptions();
}
/**
 * Determines whether a given file in a specific replica has a valid replica store state. Valid
 * here means a replica store state other than UNKNOWN_UPLOAD_STATE.
 *
 * <p>TODO Verify the assumption that all upload states besides UNKNOWN_UPLOAD_STATE are
 * acceptable.
 *
 * @param filename The name of the file for the ReplicaStoreState.
 * @param repChannelId The identification channel of the replica for the ReplicaStoreState.
 * @return Whether a given file in a specific replica has a valid store state.
 * @throws ArgumentNotValid If either the filename or the replica identification channel is null
 *     or the empty string.
 */
@Override
public boolean hasState(String filename, String repChannelId) throws ArgumentNotValid {
    ArgumentNotValid.checkNotNullOrEmpty(filename, "String filename");
    ArgumentNotValid.checkNotNullOrEmpty(repChannelId, "String repChannelId");

    // Retrieve the replica.
    Replica rep = Channels.retrieveReplicaFromIdentifierChannel(repChannelId);

    // Retrieve the state for the entry for the replica and filename.
    ReplicaStoreState state = database.getReplicaStoreState(filename, rep.getId());

    // Return whether the entry has a known upload state.
    return state != ReplicaStoreState.UNKNOWN_UPLOAD_STATE;
}
public ExportFrontierReportCsvQuery(ServletRequest req) {
    String jobIdStr = UI_FIELD.JOB_ID.getValue(req);
    ArgumentNotValid.checkNotNullOrEmpty(jobIdStr, UI_FIELD.JOB_ID.name());
    jobId = Long.parseLong(jobIdStr);
}
/**
 * Initialise the batch job.
 *
 * @param regexp The regexp to match in the crawl.log lines.
 */
public CrawlLogLinesMatchingRegexp(String regexp) {
    ArgumentNotValid.checkNotNullOrEmpty(regexp, "regexp");
    this.regexp = regexp;
    // One week in milliseconds.
    batchJobTimeout = 7 * Constants.ONE_DAY_IN_MILLIES;
}
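// Illustrative usage sketch (not from the original source): processRecord() uses
// String.matches(), so the regexp must cover the entire crawl.log line, not just a substring.
CrawlLogLinesMatchingRegexp fetched404s = new CrawlLogLinesMatchingRegexp(".*\\s404\\s.*");
CrawlLogLinesMatchingRegexp robotsLines = new CrawlLogLinesMatchingRegexp(".*robots\\.txt.*");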
/**
 * Method for telling whether a file entry exists.
 *
 * @param filename The name of the file, the existence of whose entry is to be determined.
 * @return Whether the entry exists.
 * @throws ArgumentNotValid If the filename is either null or empty.
 */
@Override
public boolean hasEntry(String filename) throws ArgumentNotValid {
    ArgumentNotValid.checkNotNullOrEmpty(filename, "String filename");

    // See if the file can be found in the database.
    return database.existsFileInDB(filename);
}
/**
 * Removes the StoreMessage of a specific file and returns it.
 *
 * @param filename The name of the file whose StoreMessage should be removed.
 * @return The StoreMessage corresponding to the file. A null is returned if the corresponding
 *     StoreMessage is not found.
 * @throws ArgumentNotValid If the filename is either null or the empty string.
 */
@Override
public StoreMessage removeReplyInfo(String filename) throws ArgumentNotValid {
    ArgumentNotValid.checkNotNullOrEmpty(filename, "String filename");

    // Remove the entry from the map and return it.
    return storeEntries.remove(filename);
}
/**
 * Determines whether the StoreMessage of a given file exists.
 *
 * @param filename The name of the file for which the existence of the StoreMessage should be
 *     determined.
 * @return Whether the StoreMessage of the file exists.
 * @throws ArgumentNotValid If the filename is null or the empty string.
 */
@Override
public boolean hasReplyInfo(String filename) throws ArgumentNotValid {
    ArgumentNotValid.checkNotNullOrEmpty(filename, "String filename");

    // Check if an entry for the file can be found in the map.
    return storeEntries.containsKey(filename);
}
/**
 * Set the axis range.
 *
 * @param axis a NumberAxis
 * @param range a range given as {lower, upper}, or null to enable auto-ranging
 */
private void setAxisRange(NumberAxis axis, double[] range) {
    if (range == null || range.length != 2) {
        axis.setAutoRange(true);
    } else {
        double lower = range[0];
        double upper = range[1];
        ArgumentNotValid.checkTrue(lower < upper, "Incorrect range");
        axis.setAutoRange(false);
        axis.setRange(new Range(lower, upper));
    }
}
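// Illustrative sketch (not from the original source): the range argument is read as
// {lower, upper}; null (or a wrong-length array) switches the axis back to auto-ranging.
void demoAxisRange(NumberAxis yAxis) {
    setAxisRange(yAxis, new double[] {0.0, 10000.0}); // fixed range [0, 10000]
    setAxisRange(yAxis, null);                         // auto-range
}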
/**
 * Calls the Unix sort command with the options <code>$filesNames -o $outputfile -T
 * WaybackSettings#WAYBACK_AGGREGATOR_TEMP_DIR</code>.
 *
 * <p>Sets the LC_ALL environment variable before making the call.
 *
 * @param files The files to merge and sort.
 * @param outputFile The resulting sorted file.
 * @param additionalArgs A list of extra arguments, which (if different from null) are added to
 *     the sort call.
 *     <p>Note: If any of the args contain whitespace the call will fail.
 */
private void processFiles(File[] files, File outputFile, List<String> additionalArgs) {
    if (files.length == 0) {
        // An empty file list would cause sort to wait for further input,
        // and the call would therefore never return.
        return;
    }

    Process p = null;
    try {
        List<String> inputFileList = new LinkedList<String>();
        for (int i = 0; i < files.length; i++) {
            if (files[i].exists() && files[i].isFile()) {
                inputFileList.add(files[i].getCanonicalPath());
            } else {
                log.warn("File " + files[i] + " doesn't exist or isn't a regular file, "
                        + "dropping from list of files to sort and merge");
            }
        }

        List<String> cmd = new LinkedList<String>();
        // Prepare to run the unix sort command, see the sort manual page for details.
        cmd.add("sort");
        cmd.addAll(inputFileList);
        cmd.add("-o");
        cmd.add(outputFile.getCanonicalPath());
        cmd.add("-T");
        cmd.add(Settings.get(WaybackSettings.WAYBACK_AGGREGATOR_TEMP_DIR));
        if (additionalArgs != null && !additionalArgs.isEmpty()) {
            for (String argument : additionalArgs) {
                ArgumentNotValid.checkTrue(argument.indexOf(' ') == -1,
                        "The argument '" + argument + "' contains spaces, this isn't allowed");
            }
            cmd.addAll(additionalArgs);
        }

        ProcessBuilder pb = new ProcessBuilder(cmd);
        // Reset all locale definitions.
        pb.environment().put("LC_ALL", "C");
        // Run the command in the user.dir directory.
        pb.directory(new File(System.getProperty("user.dir")));
        p = pb.start();
        p.waitFor();
        if (p.exitValue() != 0) {
            log.error("Failed to sort index files, sort exited with return code " + p.exitValue());
        }
    } catch (Exception e) {
        log.error("Failed to aggregate indexes ", e);
    }
}
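// Illustrative sketch (assumption): extra sort options must be whitespace-free, so a compound
// option like "-k 2" is passed as two separate list elements. The file names are placeholders.
void demoSortWithExtraArgs(File[] inputFiles) {
    List<String> extraArgs = new LinkedList<String>();
    extraArgs.add("-u"); // drop duplicate lines
    extraArgs.add("-k"); // sort key option...
    extraArgs.add("2");  // ...and its value, as a separate argument
    processFiles(inputFiles, new File("aggregated.cdx"), extraArgs);
}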
/**
 * Unpacks and calls accept() on the message object.
 *
 * <p>This method catches <b>all</b> exceptions and logs them.
 *
 * @param msg an ObjectMessage
 */
public void onMessage(Message msg) {
    ArgumentNotValid.checkNotNull(msg, "Message msg");
    log.trace("Message received:\n{}", msg.toString());
    try {
        ((ArchiveMessage) JMSConnection.unpack(msg)).accept(this);
    } catch (ClassCastException e) {
        log.warn("Invalid message type", e);
    } catch (Throwable t) {
        log.warn("Error processing message '{}'", msg, t);
    }
}
/**
 * Creates a new NetarkivetMessage.
 *
 * @param to the initial receiver of the message
 * @param replyTo the initial sender of the message
 * @throws ArgumentNotValid if to==replyTo, the replyTo parameter is a topic instead of a queue,
 *     or there is a null parameter.
 */
protected NetarkivetMessage(ChannelID to, ChannelID replyTo) {
    ArgumentNotValid.checkNotNull(to, "to");
    ArgumentNotValid.checkNotNull(replyTo, "replyTo");

    if (to.getName().equals(replyTo.getName())) {
        throw new ArgumentNotValid("to and replyTo should not be equal.");
    }

    // Have not implemented replying to a topic because there is no use
    // for it in our current architecture.
    if (Channels.isTopic(replyTo.getName())) {
        throw new ArgumentNotValid("Reply channel must be queue but " + replyTo.toString() + " is a Topic");
    }

    this.to = to;
    this.replyTo = replyTo;
    this.id = null;
    this.replyOfId = null;
}
/**
 * Retrieves the checksum of a given file.
 *
 * @param filename The name of the file, whose checksum should be retrieved.
 * @return The checksum of the file.
 * @throws ArgumentNotValid If the filename is either null or the empty string.
 * @throws UnknownID If no entry exists for the file.
 */
@Override
public String getCheckSum(String filename) throws ArgumentNotValid {
    ArgumentNotValid.checkNotNullOrEmpty(filename, "String filename");

    // Ensure that we have the file requested.
    if (!hasEntry(filename)) {
        throw new UnknownID("Don't know anything about file '" + filename + "'");
    }

    // Retrieve the checksum for a specific entry.
    return database.getChecksum(filename);
}
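// Illustrative usage sketch (not from the original source): guard with hasEntry() to avoid the
// UnknownID thrown for files that have no entry.
String checksumOrNull(String filename) {
    if (hasEntry(filename)) {
        return getCheckSum(filename);
    }
    return null;
}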
/**
 * Create a new batch job that runs the loaded class.
 *
 * @param classFile the classfile for the batch job we want to run.
 * @param arguments The arguments for the batchjobs. This can be null.
 * @throws ArgumentNotValid If the classfile is null.
 */
public LoadableFileBatchJob(File classFile, List<String> arguments) throws ArgumentNotValid {
    ArgumentNotValid.checkNotNull(classFile, "File classFile");

    fileContents = FileUtils.readBinaryFile(classFile);
    fileName = classFile.getName();
    if (arguments == null) {
        this.args = new ArrayList<String>();
    } else {
        this.args = arguments;
    }
    loadBatchJob();
}
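// Illustrative usage sketch (assumption): load a compiled batch job class from disk and hand it
// two arguments; the path and the argument values are placeholders.
File batchClassFile = new File("batchjobs/ChecksumJob.class");
List<String> batchArgs = java.util.Arrays.asList("--verbose", "UTF-8");
LoadableFileBatchJob demoBatchJob = new LoadableFileBatchJob(batchClassFile, batchArgs);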
/**
 * Processes a crawl log record, writing the lines that match the regexp to the result.
 *
 * @param record The record to process.
 * @param os The output stream for the result.
 * @throws ArgumentNotValid on null parameters
 * @throws IOFailure on trouble processing the record.
 */
@Override
public void processRecord(ArchiveRecordBase record, OutputStream os) {
    ArgumentNotValid.checkNotNull(record, "ArchiveRecordBase record");
    ArgumentNotValid.checkNotNull(os, "OutputStream os");

    BufferedReader arcreader = new BufferedReader(new InputStreamReader(record.getInputStream()));
    try {
        for (String line = arcreader.readLine(); line != null; line = arcreader.readLine()) {
            if (line.matches(regexp)) {
                os.write(line.getBytes("UTF-8"));
                os.write('\n');
            }
        }
    } catch (IOException e) {
        throw new IOFailure("Unable to process (w)arc record", e);
    } finally {
        try {
            arcreader.close();
        } catch (IOException e) {
            log.warn("Unable to close arcreader properly", e);
        }
    }
}
/**
 * Set the actual time when this job was started.
 *
 * <p>Logs a warning, if actualStart is set to a time after actualStop.
 *
 * @param actualStart A Date object representing the time when this job was started.
 */
public void setActualStart(Date actualStart) {
    ArgumentNotValid.checkNotNull(actualStart, "actualStart");
    if (actualStop != null && actualStop.before(actualStart)) {
        log.warn("Job(" + getJobID() + "): Start time (" + actualStart + ") is after end time: " + actualStop);
    }
    this.actualStart = (Date) actualStart.clone();
}
/**
 * Set the actual time when this job was stopped/completed. Logs a warning, if actualStop is set
 * to a time before actualStart.
 *
 * @param actualStop A Date object representing the time when this job was stopped.
 * @throws ArgumentNotValid if actualStop is null
 */
public void setActualStop(Date actualStop) throws ArgumentNotValid {
    ArgumentNotValid.checkNotNull(actualStop, "actualStop");
    if (actualStart == null) {
        log.warn("Job(" + getJobID() + "): actualStart should be defined before setting actualStop");
    } else if (actualStop.before(actualStart)) {
        log.warn("Job(" + getJobID() + "): actualStop (" + actualStop + ") is before actualStart: " + actualStart);
    }
    this.actualStop = (Date) actualStop.clone();
}
/**
 * Set the seedlist of the job from the seedList argument. Individual seeds are separated by a
 * '\n' character. Duplicate seeds are removed.
 *
 * @param seedList List of seeds as one String
 */
public void setSeedList(String seedList) {
    ArgumentNotValid.checkNotNullOrEmpty(seedList, "seedList");
    seedListSet = new HashSet<>();
    BufferedReader reader = new BufferedReader(new StringReader(seedList));
    String seed;
    try {
        while ((seed = reader.readLine()) != null) {
            seedListSet.add(seed); // add to seedlist if not already there
        }
    } catch (IOException e) {
        // This never happens, as we're reading from a string!
        throw new IOFailure("IOException reading from seed string", e);
    } finally {
        IOUtils.closeQuietly(reader);
    }
}
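// Illustrative usage sketch (not from the original source): seeds are newline-separated and the
// duplicate third line collapses silently, leaving two seeds in the job.
void demoSeedList() {
    setSeedList("http://www.example.org/\n"
            + "http://www.example.net/index.html\n"
            + "http://www.example.org/"); // duplicate of the first seed, silently ignored
}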
/**
 * Sets status of this job.
 *
 * @param newStatus Must be one of the values STATUS_NEW, ..., STATUS_FAILED
 * @throws ArgumentNotValid in case of invalid status argument or invalid status change
 */
public void setStatus(JobStatus newStatus) {
    ArgumentNotValid.checkNotNull(newStatus, "newStatus");
    if (!status.legalChange(newStatus)) {
        final String message = "Status change from " + status + " to " + newStatus + " is not allowed";
        log.debug(message);
        throw new ArgumentNotValid(message);
    }

    if ((this.status == JobStatus.NEW || this.status == JobStatus.RESUBMITTED)
            && newStatus == JobStatus.SUBMITTED) {
        orderXMLdoc.configureQuotaEnforcer(maxObjectsIsSetByQuotaEnforcer, forceMaxBytesPerDomain,
                forceMaxObjectsPerDomain);
    }

    if (this.status == JobStatus.SUBMITTED && newStatus == JobStatus.STARTED) {
        setActualStart(new Date());
    }
    if (this.status == JobStatus.STARTED && (newStatus == JobStatus.DONE || newStatus == JobStatus.FAILED)) {
        setActualStop(new Date());
    }
    status = newStatus;
}
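// Illustrative sketch (assumption about the usual life cycle): each change must satisfy
// JobStatus.legalChange(); the normal progression of a successful job looks like this.
void demoStatusLifeCycle() {
    setStatus(JobStatus.SUBMITTED); // NEW -> SUBMITTED (configures the quota enforcer)
    setStatus(JobStatus.STARTED);   // SUBMITTED -> STARTED (records the actual start time)
    setStatus(JobStatus.DONE);      // STARTED -> DONE (records the actual stop time)
}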
/**
 * Package private constructor for common initialisation.
 *
 * @param harvestID the id of the harvestdefinition
 * @param cfg the configuration to base the Job on
 * @param orderXMLdoc the Heritrix template to use for this job
 * @param channel the channel on which the job will be submitted.
 * @param forceMaxObjectsPerDomain the maximum number of objects harvested from a domain,
 *     overrides individual configuration settings. -1 means no limit
 * @param forceMaxBytesPerDomain The maximum number of bytes harvested from a domain, or -1 for
 *     no limit.
 * @param forceMaxJobRunningTime The max time in seconds given to the harvester for this job
 * @param harvestNum the run number of the harvest definition
 * @throws ArgumentNotValid if cfg, harvestID or channel is null, harvestID is invalid, or if any
 *     limit < -1
 */
public Job(Long harvestID, DomainConfiguration cfg, HeritrixTemplate orderXMLdoc, HarvestChannel channel,
        long forceMaxObjectsPerDomain, long forceMaxBytesPerDomain, long forceMaxJobRunningTime, int harvestNum)
        throws ArgumentNotValid {
    ArgumentNotValid.checkNotNull(cfg, "cfg");
    ArgumentNotValid.checkNotNull(harvestID, "harvestID");
    ArgumentNotValid.checkNotNegative(harvestID, "harvestID");
    ArgumentNotValid.checkNotNull(channel, "channel");

    if (forceMaxObjectsPerDomain < -1) {
        String msg = "forceMaxObjectsPerDomain must be either -1 or positive";
        log.debug(msg);
        throw new ArgumentNotValid(msg);
    }
    if (forceMaxBytesPerDomain < -1) {
        String msg = "forceMaxBytesPerDomain must be either -1 or positive";
        log.debug(msg);
        throw new ArgumentNotValid(msg);
    }

    if (forceMaxBytesPerDomain == 0L) {
        log.warn("forceMaxBytesPerDomain should probably not be 0. Means 0 bytes downloaded per domain");
    }
    if (forceMaxObjectsPerDomain == 0L) {
        log.warn("forceMaxObjectsPerDomain should probably not be 0. Means 0 objects downloaded per domain");
    }

    // Set up initial members.
    domainConfigurationMap = new HashMap<>();
    origHarvestDefinitionID = harvestID;
    orderXMLname = cfg.getOrderXmlName();
    this.orderXMLdoc = orderXMLdoc;

    setHarvestChannel(channel);

    long maxObjects = NumberUtils.minInf(forceMaxObjectsPerDomain, cfg.getMaxObjects());
    setMaxObjectsPerDomain(maxObjects);
    configurationSetsObjectLimit = (maxObjects != forceMaxObjectsPerDomain);

    long maxBytes = NumberUtils.minInf(forceMaxBytesPerDomain, cfg.getMaxBytes());
    setMaxBytesPerDomain(maxBytes);
    configurationSetsByteLimit = (maxBytes != forceMaxBytesPerDomain);

    long expectation = cfg.getExpectedNumberOfObjects(forceMaxObjectsPerDomain, forceMaxBytesPerDomain);
    maxCountObjects = expectation;
    minCountObjects = expectation;
    this.harvestNum = harvestNum;

    addConfiguration(cfg);

    setMaxJobRunningTime(forceMaxJobRunningTime);

    setArchiveFormatInTemplate(Settings.get(HarvesterSettings.HERITRIX_ARCHIVE_FORMAT));
    setAttributes(cfg.getAttributesAndTypes());

    orderXMLdoc.enableOrDisableDeduplication(Settings.getBoolean(HarvesterSettings.DEDUPLICATION_ENABLED));

    status = JobStatus.NEW;
}
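// Illustrative sketch (assumption): -1 means "no limit" for the force* parameters, so a job with
// unlimited objects and bytes but a one-hour runtime cap could be created like this. The
// harvestId, cfg, template and channel values are placeholders obtained elsewhere.
Job demoJob(Long harvestId, DomainConfiguration cfg, HeritrixTemplate template, HarvestChannel channel) {
    return new Job(harvestId, cfg, template, channel,
            -1L,   // forceMaxObjectsPerDomain: no limit
            -1L,   // forceMaxBytesPerDomain: no limit
            3600L, // forceMaxJobRunningTime: one hour, in seconds
            1);    // harvestNum: first run of the harvest definition
}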
/**
 * Set the orderxml for this job.
 *
 * @param doc An orderxml to be used by this job
 */
public void setOrderXMLDoc(HeritrixTemplate doc) {
    ArgumentNotValid.checkNotNull(doc, "doc");
    this.orderXMLdoc = doc;
}
/**
 * Adds a configuration to this Job. Seedlists and settings are updated accordingly.
 *
 * @param cfg the configuration to add
 * @throws ArgumentNotValid if cfg is null or cfg uses a different orderxml than this job or if
 *     this job already contains a configuration associated with domain of configuration cfg.
 */
public void addConfiguration(DomainConfiguration cfg) {
    ArgumentNotValid.checkNotNull(cfg, "cfg");
    if (domainConfigurationMap.containsKey(cfg.getDomainName())) {
        throw new ArgumentNotValid("Job already has a configuration for Domain " + cfg.getDomainName());
    }

    if (log.isTraceEnabled()) {
        log.trace("Adding configuration '{}' to job '{}'", cfg, this);
    }

    if (!underConstruction) {
        final String msg = "Cannot modify job " + this + " as it is no longer under construction";
        log.debug(msg);
        throw new IllegalState(msg);
    }

    if (!cfg.getOrderXmlName().equals(getOrderXMLName())) {
        throw new ArgumentNotValid("Job requires the orderxml file:'" + getOrderXMLName() + "' not:'"
                + cfg.getOrderXmlName() + "' used by the configuration:'" + cfg.getName());
    }

    domainConfigurationMap.put(cfg.getDomainName(), cfg.getName());

    // Add the seeds from the configuration to the Job seeds.
    // Take care of duplicates.
    for (Iterator<SeedList> itt = cfg.getSeedLists(); itt.hasNext(); ) {
        SeedList seed = itt.next();
        List<String> seeds = seed.getSeeds();
        for (String seedUrl : seeds) {
            seedListSet.add(seedUrl); // duplicates are silently ignored

            // TODO remove when heritrix implements this functionality
            // Try to convert a seed into an Internationalized Domain Name.
            try {
                String seedASCII = seedUrl;
                // It is rare to see these seeds, but they need to be correctly idnaized.
                if (seedUrl.contains(":") || seedUrl.contains("/")) {
                    String normalizedUrl = seedUrl;
                    if (!normalizedUrl.matches("^[a-zA-Z]+:.*")) {
                        // If no protocol is given, assume http.
                        normalizedUrl = "http://" + normalizedUrl;
                    }
                    URL url = new URL(normalizedUrl);
                    String domainName = url.getHost();
                    String domainNameASCII = IDNA.toASCII(domainName);
                    if (!domainName.equals(domainNameASCII)) {
                        // If the domain name changed, replace that in the seed.
                        seedASCII = seedUrl.replaceFirst(Pattern.quote(domainName), domainNameASCII);
                    }
                } else {
                    seedASCII = IDNA.toASCII(seedUrl);
                }
                if (!seedASCII.equals(seedUrl)) {
                    log.trace("Converted {} to {}", seedUrl, seedASCII);
                    // Note that duplicates are silently ignored.
                    seedListSet.add(seedASCII);
                }
            } catch (IDNAException e) {
                log.trace("Cannot convert seed {} to ASCII", seedUrl, e);
            } catch (MalformedURLException e) {
                log.trace("Cannot convert seed {} to ASCII", seedUrl, e);
            }
        }
    }

    orderXMLdoc.editOrderXMLAddPerDomainCrawlerTraps(cfg);

    // TODO update limits in settings files - see also bug 269

    // Update estimates of job size.
    long expectation = cfg.getExpectedNumberOfObjects(forceMaxObjectsPerDomain, forceMaxBytesPerDomain);
    maxCountObjects = Math.max(expectation, maxCountObjects);
    minCountObjects = Math.min(expectation, minCountObjects);
    totalCountObjects += expectation;

    configsChanged = true;

    assert (maxCountObjects >= minCountObjects) : "basic invariant";
}
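// Illustrative sketch (not from the original source): what the IDN handling above achieves,
// shown with the standard java.net.IDN class instead of the IDNA helper used in the code.
String idnaizeSeed(String seedUrl) throws MalformedURLException {
    URL url = new URL(seedUrl);
    String asciiHost = java.net.IDN.toASCII(url.getHost());
    // Replace only the host part, keeping the scheme, path and query untouched.
    return seedUrl.replaceFirst(Pattern.quote(url.getHost()), asciiHost);
}
// e.g. idnaizeSeed("http://øl.example.org/side.html") yields the punycode host form of the URL.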