/**
 * This class represents one job to run by Heritrix. It is based on a number of configurations, all
 * based on the same order.xml, with at most one configuration per domain. Each job consists of
 * configurations of approximately the same size; that is, the expected size of the largest
 * configuration is within a factor limMaxRelSize of the smallest (differences smaller than
 * limMinAbsSize are ignored). There is also a limit, limMaxTotalSize, on the total size of the job
 * in objects.
 *
 * <p>A job may also be limited on bytes or objects, defined either by the configurations in the job
 * or by the harvest definition the job is generated by.
 *
 * <p>The job contains the order file, the seedlist and the current status of the job, as well as
 * the ID of the harvest definition that defined it and the names of all the configurations it is
 * based on.
 */
@SuppressWarnings({"serial"})
public class Job implements Serializable, JobInfo {

    private static final transient Logger log = LoggerFactory.getLogger(Job.class);

    // Persistent fields stored in and read from DAO

    /** The persistent ID of this job. */
    private Long jobID;

    /** The ID of the harvest definition that generated this job. */
    protected Long origHarvestDefinitionID;

    /** The status of the job. See the JobStatus class for the possible states. */
    protected JobStatus status;

    /** The name of the {@link HarvestChannel} on which this job will be posted. */
    private String channel;

    /** True if the job belongs to a snapshot harvest, false if it belongs to a partial harvest. */
    private boolean isSnapshot;

    /**
     * Overrides the individual configurations' maximum setting for objects retrieved from a domain
     * when set to a positive value.
     */
    private long forceMaxObjectsPerDomain = Constants.HERITRIX_MAXOBJECTS_INFINITY;

    /**
     * Overrides the individual configurations' maximum setting for bytes retrieved from a domain
     * when set to other than -1.
     */
    private long forceMaxBytesPerDomain = Constants.HERITRIX_MAXBYTES_INFINITY;

    /** The name of the harvest template used by the job. */
    private String orderXMLname;

    /** The harvest template used by the job. */
    private HeritrixTemplate orderXMLdoc;

    /** The list of Heritrix settings files. */
    private File[] settingsXMLfiles;

    /** The corresponding Dom4j Documents for these files. */
    // private Document[] settingsXMLdocs;

    /**
     * A set of seeds involved in this job. Outside the setSeedList() method, the set of seeds is
     * updated in the addConfiguration() method.
     */
    private Set<String> seedListSet = new HashSet<String>();

    /** Which run of the harvest definition this is. */
    private int harvestNum;

    /** Errors during harvesting. */
    private String harvestErrors;

    /** Details about errors during harvesting. */
    private String harvestErrorDetails;

    /** Errors during upload of the harvested data. */
    private String uploadErrors;

    /** Details about errors during upload of the harvested data. */
    private String uploadErrorDetails;

    /** The starting point of the job. */
    private Date actualStart;

    /** The ending point of the job. */
    private Date actualStop;

    /** The time when this job was submitted. */
    private Date submittedDate;

    /** The time when this job was created. */
    private Date creationDate;

    /** Edition is used by the DAO to keep track of changes. */
    private long edition = -1;

    /** Resubmitted as the Job with this ID. If null, this job has not been resubmitted. */
    private Long resubmittedAsJobWithID;

    /** Continuation of this job. */
    private Long continuationOF;

    /**
     * A map (domainName, domainConfigurationName), must be accessible in order to update job
     * information (see Ass. 2.4.3).
     */
    private Map<String, String> domainConfigurationMap;

    /**
     * A hint to the DAO that configurations have changed. Since configurations are large, the DAO
     * can use the fact that this is false to avoid updating the config list. The DAO can set it to
     * false after saving configurations.
     */
    boolean configsChanged = false;

    // Intermediate fields, non-persistent and only used while building objects

    /**
     * Whether the maxObjects field was defined by the harvest definition or the configuration
     * limit. This decides whether we accept smaller configurations when building jobs. True means
     * the limit is defined by the configuration, false means that it is defined by the harvest
     * definition.
     */
    private boolean configurationSetsObjectLimit;

    /**
     * Whether the maxBytes field was defined by the harvest definition or the configuration limit.
     * This decides whether we accept smaller configurations when building jobs. True means the
     * limit is defined by the configuration, false means by the harvest definition.
     */
    private boolean configurationSetsByteLimit;

    /** The lowest number of objects expected by a configuration. */
    private long minCountObjects;

    /** The highest number of objects expected by a configuration. */
    private long maxCountObjects;

    /** The total number of objects expected by all added configurations. */
    private long totalCountObjects;

    /** The max time in seconds given to the harvester for this job. 0 is unlimited. */
    private long forceMaxRunningTime;

    /**
     * If true, this job object is still undergoing changes due to having more configurations
     * added. Once set to false, the object is considered immutable except for updating the status.
     *
     * <p>Jobs loaded from the DAO are never under construction anymore.
     */
    private boolean underConstruction = true;

    // Constants
    // Note: The following constants are intentionally left non-static for easy
    // unit testing

    private boolean maxObjectsIsSetByQuotaEnforcer =
            Settings.getBoolean(HarvesterSettings.OBJECT_LIMIT_SET_BY_QUOTA_ENFORCER);

    /**
     * The harvestname prefix used in the files generated by Heritrix. It is set using an
     * ArchiveFileNaming class when the jobID is available.
     */
    private String harvestnamePrefix;

    /** This variable is right now the same as the harvestdefinitions.audience field. */
    private String harvestAudience;

    protected Job() {
        this.status = JobStatus.NEW;
    }

    /**
     * Package private constructor for common initialisation.
     *
     * @param harvestID the ID of the harvest definition
     * @param cfg the configuration to base the Job on
     * @param orderXMLdoc the harvest template (order.xml) to use for the job
     * @param channel the channel on which the job will be submitted.
     * @param forceMaxObjectsPerDomain the maximum number of objects harvested from a domain,
     *     overrides individual configuration settings. -1 means no limit
     * @param forceMaxBytesPerDomain the maximum number of bytes harvested from a domain, or -1 for
     *     no limit.
     * @param forceMaxJobRunningTime The max time in seconds given to the harvester for this job
     * @param harvestNum the run number of the harvest definition
     * @throws ArgumentNotValid if cfg, harvestID or channel is null, if harvestID is negative, or
     *     if any limit is less than -1
     */
    public Job(
            Long harvestID,
            DomainConfiguration cfg,
            HeritrixTemplate orderXMLdoc,
            HarvestChannel channel,
            long forceMaxObjectsPerDomain,
            long forceMaxBytesPerDomain,
            long forceMaxJobRunningTime,
            int harvestNum)
            throws ArgumentNotValid {
        ArgumentNotValid.checkNotNull(cfg, "cfg");
        ArgumentNotValid.checkNotNull(harvestID, "harvestID");
        ArgumentNotValid.checkNotNegative(harvestID, "harvestID");
        ArgumentNotValid.checkNotNull(channel, "channel");

        if (forceMaxObjectsPerDomain < -1) {
            String msg = "forceMaxObjectsPerDomain must be either -1 or positive";
            log.debug(msg);
            throw new ArgumentNotValid(msg);
        }
        if (forceMaxBytesPerDomain < -1) {
            String msg = "forceMaxBytesPerDomain must be either -1 or positive";
            log.debug(msg);
            throw new ArgumentNotValid(msg);
        }
        if (forceMaxBytesPerDomain == 0L) {
            log.warn(
                    "forceMaxBytesPerDomain should probably not be 0. Means 0 bytes downloaded per domain");
        }
        if (forceMaxObjectsPerDomain == 0L) {
            log.warn(
                    "forceMaxObjectsPerDomain should probably not be 0. Means 0 objects downloaded per domain");
        }

        // set up initial members
        domainConfigurationMap = new HashMap<>();
        origHarvestDefinitionID = harvestID;
        orderXMLname = cfg.getOrderXmlName();
        this.orderXMLdoc = orderXMLdoc;

        setHarvestChannel(channel);

        long maxObjects = NumberUtils.minInf(forceMaxObjectsPerDomain, cfg.getMaxObjects());
        setMaxObjectsPerDomain(maxObjects);
        configurationSetsObjectLimit = (maxObjects != forceMaxObjectsPerDomain);

        long maxBytes = NumberUtils.minInf(forceMaxBytesPerDomain, cfg.getMaxBytes());
        setMaxBytesPerDomain(maxBytes);
        configurationSetsByteLimit = (maxBytes != forceMaxBytesPerDomain);

        long expectation =
                cfg.getExpectedNumberOfObjects(forceMaxObjectsPerDomain, forceMaxBytesPerDomain);
        maxCountObjects = expectation;
        minCountObjects = expectation;
        this.harvestNum = harvestNum;

        addConfiguration(cfg);

        setMaxJobRunningTime(forceMaxJobRunningTime);

        setArchiveFormatInTemplate(Settings.get(HarvesterSettings.HERITRIX_ARCHIVE_FORMAT));
        setAttributes(cfg.getAttributesAndTypes());

        orderXMLdoc.enableOrDisableDeduplication(
                Settings.getBoolean(HarvesterSettings.DEDUPLICATION_ENABLED));

        status = JobStatus.NEW;
    }

    public void setAttributes(List<AttributeAndType> attributesAndTypes) {
        orderXMLdoc.insertAttributes(attributesAndTypes);
    }

    /** Update the order template according to the chosen archive format (arc/warc). */
    private void setArchiveFormatInTemplate(String archiveFormat) {
        if (!underConstruction) {
            final String msg = "Cannot modify job " + this + " as it is no longer under construction";
            log.debug(msg);
            throw new IllegalState(msg);
        }
        orderXMLdoc.setArchiveFormat(archiveFormat);
    }

    /**
     * Create a new Job object from basic information stored in the DAO.
     *
     * @param harvestID the ID of the harvest definition
     * @param configurations the configurations to base the Job on
     * @param channel the name of the channel on which the job will be submitted.
     * @param snapshot whether the job belongs to a snapshot harvest
     * @param forceMaxObjectsPerDomain the maximum number of objects harvested from a domain,
     *     overrides individual configuration settings. -1 means no limit.
     * @param forceMaxBytesPerDomain the maximum number of bytes harvested from a domain, or -1 for
     *     no limit.
     * @param forceMaxJobRunningTime The max time in seconds given to the harvester for this job
     * @param status the current status of the job.
     * @param orderXMLname the name of the order template used.
     * @param orderXMLdoc the (possibly modified) template
     * @param seedlist the combined seedlist from all configs.
     * @param harvestNum the run number of the harvest definition
     * @param continuationOf the ID of the job this job continues, or null if it starts from scratch
     */
    Job(
            Long harvestID,
            Map<String, String> configurations,
            String channel,
            boolean snapshot,
            long forceMaxObjectsPerDomain,
            long forceMaxBytesPerDomain,
            long forceMaxJobRunningTime,
            JobStatus status,
            String orderXMLname,
            HeritrixTemplate orderXMLdoc,
            String seedlist,
            int harvestNum,
            Long continuationOf) {
        origHarvestDefinitionID = harvestID;
        domainConfigurationMap = configurations;
        this.channel = channel;
        this.isSnapshot = snapshot;
        this.forceMaxBytesPerDomain = forceMaxBytesPerDomain;
        this.forceMaxObjectsPerDomain = forceMaxObjectsPerDomain;
        this.forceMaxRunningTime = forceMaxJobRunningTime;
        this.status = status;
        this.orderXMLname = orderXMLname;
        this.orderXMLdoc = orderXMLdoc;
        this.setSeedList(seedlist);
        this.harvestNum = harvestNum;
        this.continuationOF = continuationOf;

        underConstruction = false;
    }

    /**
     * Adds a configuration to this Job. Seedlists and settings are updated accordingly.
     *
     * @param cfg the configuration to add
     * @throws ArgumentNotValid if cfg is null, if cfg uses a different orderxml than this job, or
     *     if this job already contains a configuration associated with the domain of cfg.
     */
    public void addConfiguration(DomainConfiguration cfg) {
        ArgumentNotValid.checkNotNull(cfg, "cfg");
        if (domainConfigurationMap.containsKey(cfg.getDomainName())) {
            throw new ArgumentNotValid(
                    "Job already has a configuration for Domain " + cfg.getDomainName());
        }

        if (log.isTraceEnabled()) {
            log.trace("Adding configuration '{}' to job '{}'", cfg.getName(), this);
        }

        if (!underConstruction) {
            final String msg = "Cannot modify job " + this + " as it is no longer under construction";
            log.debug(msg);
            throw new IllegalState(msg);
        }
        if (!cfg.getOrderXmlName().equals(getOrderXMLName())) {
            throw new ArgumentNotValid(
                    "Job requires the orderxml file:'"
                            + getOrderXMLName()
                            + "' not:'"
                            + cfg.getOrderXmlName()
                            + "' used by the configuration:'"
                            + cfg.getName()
                            + "'");
        }

        domainConfigurationMap.put(cfg.getDomainName(), cfg.getName());

        // Add the seeds from the configuration to the Job seeds.
        // Take care of duplicates.
        for (Iterator<SeedList> itt = cfg.getSeedLists(); itt.hasNext(); ) {
            SeedList seed = itt.next();
            List<String> seeds = seed.getSeeds();
            for (String seedUrl : seeds) {
                seedListSet.add(seedUrl); // duplicates are silently ignored

                // TODO remove when heritrix implements this functionality
                // try to convert a seed into an Internationalized Domain Name
                try {
                    String seedASCII = seedUrl;
                    // It is rare to see these seeds, but they need to be
                    // correctly idnaized
                    if (seedUrl.contains(":") || seedUrl.contains("/")) {
                        String normalizedUrl = seedUrl;
                        if (!normalizedUrl.matches("^[a-zA-Z]+:.*")) {
                            // If no protocol is given, assume http
                            normalizedUrl = "http://" + normalizedUrl;
                        }
                        URL url = new URL(normalizedUrl);
                        String domainName = url.getHost();
                        String domainNameASCII = IDNA.toASCII(domainName);
                        if (!domainName.equals(domainNameASCII)) {
                            // If the domain name changed, replace that in the seed.
                            seedASCII = seedUrl.replaceFirst(Pattern.quote(domainName), domainNameASCII);
                        }
                    } else {
                        seedASCII = IDNA.toASCII(seedUrl);
                    }
                    if (!seedASCII.equals(seedUrl)) {
                        log.trace("Converted {} to {}", seedUrl, seedASCII);
                        // Note that duplicates are silently ignored
                        seedListSet.add(seedASCII);
                    }
                } catch (IDNAException e) {
                    log.trace("Cannot convert seed {} to ASCII", seedUrl, e);
                } catch (MalformedURLException e) {
                    log.trace("Cannot convert seed {} to ASCII", seedUrl, e);
                }
            }
        }

        orderXMLdoc.editOrderXMLAddPerDomainCrawlerTraps(cfg);

        // TODO update limits in settings files - see also bug 269

        // Update estimates of job size
        long expectation =
                cfg.getExpectedNumberOfObjects(forceMaxObjectsPerDomain, forceMaxBytesPerDomain);
        maxCountObjects = Math.max(expectation, maxCountObjects);
        minCountObjects = Math.min(expectation, minCountObjects);
        totalCountObjects += expectation;

        configsChanged = true;

        assert (maxCountObjects >= minCountObjects) : "basic invariant";
    }

    /**
     * Get the name of the order XML file used by this Job.
     *
     * @return the name of the orderXML file
     */
    public String getOrderXMLName() {
        return orderXMLname;
    }

    /**
     * Get the actual time when this job was stopped/completed.
     *
     * @return the time as Date
     */
    public Date getActualStop() {
        return actualStop;
    }

    /**
     * Get the actual time when this job was started.
     *
     * @return the time as Date
     */
    public Date getActualStart() {
        return actualStart;
    }

    /**
     * Get the time when this job was submitted.
     *
     * @return the time as Date
     */
    public Date getSubmittedDate() {
        return submittedDate;
    }

    /**
     * Get the time when this job was created.
     *
     * @return the creation time as a <code>Date</code>
     */
    public Date getCreationDate() {
        return creationDate;
    }

    /**
     * Get a list of Heritrix settings.xml files. Note that these files have nothing to do with
     * NetarchiveSuite settings files. They are files that supplement the Heritrix order.xml files,
     * and contain overrides for specific domains.
     *
     * @return the list of Files as an array
     */
    public File[] getSettingsXMLfiles() {
        return settingsXMLfiles;
    }

    /**
     * Get the id of the HarvestDefinition from which this job originates.
     *
     * @return the id as a Long
     */
    public Long getOrigHarvestDefinitionID() {
        return origHarvestDefinitionID;
    }

    /**
     * Get the id of this Job.
     *
     * @return the id as a Long
     */
    public Long getJobID() {
        return jobID;
    }

    /**
     * Set the id of this Job.
     *
     * @param id The Id for this job.
     */
    public void setJobID(Long id) {
        jobID = id;
    }

    /**
     * Gets the total number of different domains harvested by this job.
     *
     * @return the number of configurations added to this job
     */
    public int getCountDomains() {
        return domainConfigurationMap.size();
    }

    /**
     * Set the actual time when this job was started.
     *
     * <p>Logs a warning if actualStart is set to a time after actualStop.
     *
     * @param actualStart A Date object representing the time when this job was started.
     */
    public void setActualStart(Date actualStart) {
        ArgumentNotValid.checkNotNull(actualStart, "actualStart");
        if (actualStop != null && actualStop.before(actualStart)) {
            log.warn(
                    "Job("
                            + getJobID()
                            + "): Start time ("
                            + actualStart
                            + ") is after end time: "
                            + actualStop);
        }
        this.actualStart = (Date) actualStart.clone();
    }

    /**
     * Set the actual time when this job was stopped/completed. Logs a warning if actualStop is set
     * to a time before actualStart.
     *
     * @param actualStop A Date object representing the time when this job was stopped.
     * @throws ArgumentNotValid if actualStop is null
     */
    public void setActualStop(Date actualStop) throws ArgumentNotValid {
        ArgumentNotValid.checkNotNull(actualStop, "actualStop");
        if (actualStart == null) {
            log.warn("Job(" + getJobID() + "): actualStart should be defined before setting actualStop");
        } else if (actualStop.before(actualStart)) {
            log.warn(
                    "Job("
                            + getJobID()
                            + "): actualStop ("
                            + actualStop
                            + ") is before actualStart: "
                            + actualStart);
        }
        this.actualStop = (Date) actualStop.clone();
    }

    /**
     * Set the orderxml for this job.
     *
     * @param doc an orderxml to be used by this job
     */
    public void setOrderXMLDoc(HeritrixTemplate doc) {
        ArgumentNotValid.checkNotNull(doc, "doc");
        this.orderXMLdoc = doc;
    }

    /**
     * Gets the harvest template representation of the order.xml associated with this Job.
     *
     * @return the template as a HeritrixTemplate
     */
    public HeritrixTemplate getOrderXMLdoc() {
        return orderXMLdoc;
    }

    // /**
    //  * Gets a list of document representations of the settings.xml's associated with this Job.
    //  *
    //  * @return the XML as an array of org.dom4j.Document
    //  */
    // public Document[] getSettingsXMLdocs() {
    //     return settingsXMLdocs;
    // }

    /**
     * Set the seedlist of the job from the seedList argument. Individual seeds are separated by a
     * '\n' character. Duplicate seeds are removed.
     *
     * @param seedList List of seeds as one String
     */
    public void setSeedList(String seedList) {
        ArgumentNotValid.checkNotNullOrEmpty(seedList, "seedList");
        seedListSet = new HashSet<>();
        BufferedReader reader = new BufferedReader(new StringReader(seedList));
        String seed;
        try {
            while ((seed = reader.readLine()) != null) {
                seedListSet.add(seed); // add to seedlist if not already there
            }
        } catch (IOException e) {
            // This never happens, as we're reading from a string!
            throw new IOFailure("IOException reading from seed string", e);
        } finally {
            IOUtils.closeQuietly(reader);
        }
    }

    /**
     * Get the seedlist as a String. The individual seeds are separated by the character '\n'. The
     * order of the seeds is unknown.
     *
     * @return the seedlist as a String
     */
    public String getSeedListAsString() {
        return StringUtils.conjoin("\n", seedListSet);
    }

    /**
     * Get the current status of this Job.
     *
     * @return the status as a JobStatus
     */
    public JobStatus getStatus() {
        return status;
    }

    /**
     * Sets the status of this job.
     *
     * @param newStatus the new status; must be one of the {@link JobStatus} values
     * @throws ArgumentNotValid in case of an invalid status argument or an invalid status change
     */
    public void setStatus(JobStatus newStatus) {
        ArgumentNotValid.checkNotNull(newStatus, "newStatus");
        if (!status.legalChange(newStatus)) {
            final String message = "Status change from " + status + " to " + newStatus + " is not allowed";
            log.debug(message);
            throw new ArgumentNotValid(message);
        }

        if ((this.status == JobStatus.NEW || this.status == JobStatus.RESUBMITTED)
                && newStatus == JobStatus.SUBMITTED) {
            orderXMLdoc.configureQuotaEnforcer(
                    maxObjectsIsSetByQuotaEnforcer, forceMaxBytesPerDomain, forceMaxObjectsPerDomain);
        }

        if (this.status == JobStatus.SUBMITTED && newStatus == JobStatus.STARTED) {
            setActualStart(new Date());
        }
        if (this.status == JobStatus.STARTED
                && (newStatus == JobStatus.DONE || newStatus == JobStatus.FAILED)) {
            setActualStop(new Date());
        }
        status = newStatus;
    }

    /**
     * Returns a map of domain names and the names of their corresponding configurations.
     *
     * <p>The returned Map cannot be changed.
     *
     * @return a read-only Map (<String>, <String>)
     */
    public Map<String, String> getDomainConfigurationMap() {
        return Collections.unmodifiableMap(domainConfigurationMap);
    }

    /**
     * Gets the maximum number of objects harvested per domain.
     *
     * @return The maximum number of objects harvested per domain. -1 means no limit.
     */
    public long getMaxObjectsPerDomain() {
        return forceMaxObjectsPerDomain;
    }

    /**
     * Gets the maximum number of bytes harvested per domain.
     *
     * @return The maximum number of bytes harvested per domain. -1 means no limit.
     */
    public long getMaxBytesPerDomain() {
        return forceMaxBytesPerDomain;
    }

    /**
     * Get the edition number.
     *
     * @return The edition number
     */
    long getEdition() {
        return edition;
    }

    /**
     * Set the edition number.
     *
     * @param edition the new edition number
     */
    void setEdition(long edition) {
        this.edition = edition;
    }

    /**
     * Sets the associated {@link HarvestChannel} name and snapshot flag from the given channel.
     *
     * @param harvestChannel the channel on which this job will be posted
     */
    public void setHarvestChannel(HarvestChannel harvestChannel) {
        this.channel = harvestChannel.getName();
        this.isSnapshot = harvestChannel.isSnapshot();
    }

    /** @return the associated {@link HarvestChannel} name. */
    public String getChannel() {
        return channel;
    }

    /**
     * Sets the associated {@link HarvestChannel} name.
     *
     * @param channel the channel name
     */
    public void setChannel(String channel) {
        this.channel = channel;
    }

    /**
     * @return true if the job belongs to a snapshot harvest, false if it belongs to a focused
     *     harvest.
     */
    public boolean isSnapshot() {
        return isSnapshot;
    }

    /**
     * Sets whether the job belongs to a snapshot or focused harvest.
     *
     * @param isSnapshot true if the job belongs to a snapshot harvest, false if it belongs to a
     *     focused harvest.
     */
    public void setSnapshot(boolean isSnapshot) {
        this.isSnapshot = isSnapshot;
    }

    @Override
    public String toString() {
        return "Job "
                + getJobID()
                + " (state = "
                + getStatus()
                + ", HD = "
                + getOrigHarvestDefinitionID()
                + ", channel = "
                + getChannel()
                + ", snapshot = "
                + isSnapshot()
                + ", forcemaxcount = "
                + getForceMaxObjectsPerDomain()
                + ", forcemaxbytes = "
                + getMaxBytesPerDomain()
                + ", forcemaxrunningtime = "
                + forceMaxRunningTime
                + ", orderxml = "
                + getOrderXMLName()
                + ", numconfigs = "
                + getDomainConfigurationMap().size()
                + ", created = "
                + getCreationDate()
                + (getSubmittedDate() != null ? ", submitted = " + getSubmittedDate() : "")
                + (getActualStart() != null ? ", started = " + getActualStart() : "")
                + (getActualStop() != null ? ", stopped = " + getActualStop() : "")
                + ")";
    }

    /** @return Returns the forceMaxObjectsPerDomain. -1 means no limit. */
    public long getForceMaxObjectsPerDomain() {
        return forceMaxObjectsPerDomain;
    }

    /**
     * Sets the maxObjectsPerDomain value.
     *
     * @param maxObjectsPerDomain The forceMaxObjectsPerDomain to set. -1 means no limit.
     * @throws IllegalState if this job is no longer under construction.
     */
    protected void setMaxObjectsPerDomain(long maxObjectsPerDomain) {
        if (!underConstruction) {
            final String msg = "Cannot modify job " + this + " as it is no longer under construction";
            log.debug(msg);
            throw new IllegalState(msg);
        }

        this.forceMaxObjectsPerDomain = maxObjectsPerDomain;
        // FIXME? add maxObjectsIsSetByQuotaEnforcer as argument to setMaxObjectsPerDomain
        orderXMLdoc.setMaxObjectsPerDomain(maxObjectsPerDomain);
        // orderXMLdoc.editOrderXML_maxObjectsPerDomain(orderXMLdoc, maxObjectsPerDomain,
        // maxObjectsIsSetByQuotaEnforcer);

        if (0L == maxObjectsPerDomain && 0L != forceMaxBytesPerDomain) {
            setMaxBytesPerDomain(0L);
        }
    }

    /**
     * Set the maxbytes per domain value.
     *
     * @param maxBytesPerDomain The maxBytesPerDomain to set, or -1 for no limit.
     */
    protected void setMaxBytesPerDomain(long maxBytesPerDomain) {
        if (!underConstruction) {
            final String msg = "Cannot modify job " + this + " as it is no longer under construction";
            log.debug(msg);
            throw new IllegalState(msg);
        }
        this.forceMaxBytesPerDomain = maxBytesPerDomain;
        orderXMLdoc.setMaxBytesPerDomain(maxBytesPerDomain);

        if (0L == maxBytesPerDomain && 0L != forceMaxObjectsPerDomain) {
            setMaxObjectsPerDomain(0L);
        }
    }

    /**
     * Set the maxJobRunningTime value.
     *
     * @param maxJobRunningTime The maxJobRunningTime in seconds to set, or 0 for no limit.
     */
    protected void setMaxJobRunningTime(long maxJobRunningTime) {
        if (!underConstruction) {
            final String msg = "Cannot modify job " + this + " as it is no longer under construction";
            log.debug(msg);
            throw new IllegalState(msg);
        }
        this.forceMaxRunningTime = maxJobRunningTime;
        orderXMLdoc.setMaxJobRunningTime(maxJobRunningTime);
    }

    /** @return Returns the MaxJobRunningTime. 0 means no limit. */
    public long getMaxJobRunningTime() {
        return forceMaxRunningTime;
    }

    /**
     * Get the harvestNum for this job. The number reflects which run of the harvest definition
     * this is.
     *
     * @return the harvestNum for this job.
     */
    public int getHarvestNum() {
        return harvestNum;
    }

    /**
     * Set the harvestNum for this job. The number reflects which run of the harvest definition
     * this is. ONLY TO BE USED IN THE CONSTRUCTION PHASE.
     *
     * @param harvestNum a given harvestNum
     */
    public void setHarvestNum(int harvestNum) {
        if (!underConstruction) {
            final String msg = "Cannot modify job " + this + " as it is no longer under construction";
            log.debug(msg);
            throw new IllegalState(msg);
        }
        this.harvestNum = harvestNum;
    }

    /**
     * Get the list of harvest errors for this job. If there are no harvest errors, null is
     * returned. This value is not meaningful until the job is finished (FAILED, DONE, RESUBMITTED).
     *
     * @return the harvest errors for this job or null if no harvest errors.
     */
    public String getHarvestErrors() {
        return harvestErrors;
    }

    /**
     * Append to the list of harvest errors for this job. Nothing happens if the argument
     * harvestErrors is null.
     *
     * @param harvestErrors a string containing harvest errors (may be null)
     */
    public void appendHarvestErrors(String harvestErrors) {
        if (harvestErrors != null) {
            if (this.harvestErrors == null) {
                this.harvestErrors = harvestErrors;
            } else {
                this.harvestErrors += "\n" + harvestErrors;
            }
        }
    }

    /**
     * Get the list of harvest error details for this job. If there are no harvest error details,
     * null is returned. This value is not meaningful until the job is finished (FAILED, DONE,
     * RESUBMITTED).
     *
     * @return the list of harvest error details for this job or null if no harvest error details.
     */
    public String getHarvestErrorDetails() {
        return harvestErrorDetails;
    }

    /**
     * Append to the list of harvest error details for this job. Nothing happens if the argument
     * harvestErrorDetails is null.
     *
     * @param harvestErrorDetails a string containing harvest error details.
     */
    public void appendHarvestErrorDetails(String harvestErrorDetails) {
        if (harvestErrorDetails != null) {
            if (this.harvestErrorDetails == null) {
                this.harvestErrorDetails = harvestErrorDetails;
            } else {
                this.harvestErrorDetails += "\n" + harvestErrorDetails;
            }
        }
    }

    /**
     * Get the list of upload errors. If there are no upload errors, null is returned. This value
     * is not meaningful until the job is finished (FAILED, DONE, RESUBMITTED).
     *
     * @return the list of upload errors as String, or null if no upload errors.
     */
    public String getUploadErrors() {
        return uploadErrors;
    }

    /**
     * Append to the list of upload errors.
     * Nothing happens if the argument uploadErrors is null.
     *
     * @param uploadErrors a string containing upload errors.
     */
    public void appendUploadErrors(String uploadErrors) {
        if (uploadErrors != null) {
            if (this.uploadErrors == null) {
                this.uploadErrors = uploadErrors;
            } else {
                this.uploadErrors += "\n" + uploadErrors;
            }
        }
    }

    /**
     * Get the list of upload error details. If there are no upload error details, null is
     * returned. This value is not meaningful until the job is finished (FAILED, DONE, RESUBMITTED).
     *
     * @return the list of upload error details as String, or null if no upload error details
     */
    public String getUploadErrorDetails() {
        return uploadErrorDetails;
    }

    /**
     * Append to the list of upload error details. Nothing happens if the argument
     * uploadErrorDetails is null.
     *
     * @param uploadErrorDetails a string containing upload error details.
     */
    public void appendUploadErrorDetails(String uploadErrorDetails) {
        if (uploadErrorDetails != null) {
            if (this.uploadErrorDetails == null) {
                this.uploadErrorDetails = uploadErrorDetails;
            } else {
                this.uploadErrorDetails += "\n" + uploadErrorDetails;
            }
        }
    }

    /**
     * Get the ID for the job which this job was resubmitted as. If null, this job has not been
     * resubmitted.
     *
     * @return the ID of the job this job was resubmitted as, or null.
     */
    public Long getResubmittedAsJob() {
        return resubmittedAsJobWithID;
    }

    /**
     * Set the Date for when this job was submitted. If null, this job has not been submitted.
     *
     * @param submittedDate The date when this was submitted
     */
    public void setSubmittedDate(Date submittedDate) {
        this.submittedDate = submittedDate;
    }

    /**
     * Set the Date for when this job was created. If null, the creation date has not been set.
     *
     * @param creationDate The date when this was created
     */
    public void setCreationDate(Date creationDate) {
        this.creationDate = creationDate;
    }

    /**
     * Set the ID for the job which this job was resubmitted as.
     *
     * @param resubmittedAsJob An ID for a new job.
     */
    public void setResubmittedAsJob(Long resubmittedAsJob) {
        this.resubmittedAsJobWithID = resubmittedAsJob;
    }

    /**
     * @return the ID of the job that this job is supposed to continue, using the Heritrix
     *     recover-log, or null if it starts from scratch.
     */
    public Long getContinuationOf() {
        return this.continuationOF;
    }

    @Override
    public String getHarvestFilenamePrefix() {
        if (this.harvestnamePrefix == null) {
            log.warn(
                    "HarvestnamePrefix not yet set for job {}. Set it by using the naming scheme. "
                            + "This should only happen for old jobs being read",
                    this.jobID);
            setDefaultHarvestNamePrefix();
        }
        return this.harvestnamePrefix;
    }

    /** @param prefix the harvest filename prefix to use for this job. */
    public void setHarvestFilenamePrefix(String prefix) {
        this.harvestnamePrefix = prefix;
    }

    /** @return the forceMaxBytesPerDomain */
    public long getForceMaxBytesPerDomain() {
        return forceMaxBytesPerDomain;
    }

    /** @return the configurationSetsObjectLimit */
    public boolean isConfigurationSetsObjectLimit() {
        return configurationSetsObjectLimit;
    }

    /** @return the configurationSetsByteLimit */
    public boolean isConfigurationSetsByteLimit() {
        return configurationSetsByteLimit;
    }

    /** @return the minCountObjects */
    public long getMinCountObjects() {
        return minCountObjects;
    }

    /** @return the maxCountObjects */
    public long getMaxCountObjects() {
        return maxCountObjects;
    }

    /** @return the totalCountObjects */
    public long getTotalCountObjects() {
        return totalCountObjects;
    }

    void setDefaultHarvestNamePrefix() {
        if (getJobID() != null) {
            ArchiveFileNaming naming = ArchiveFileNamingFactory.getInstance();
            log.debug("Applying the default ArchiveFileNaming class '{}'.", naming.getClass().getName());
            final String prefix = naming.getPrefix(this);
            setHarvestFilenamePrefix(prefix);
            log.debug("The harvestPrefix of this job is: {}", prefix);
        } else {
            log.warn(
                    "The harvestnamePrefix is not set now, as it depends on the JobID, which is not set yet");
        }
    }

    /** @return the harvest-audience. */
    public String getHarvestAudience() {
        return harvestAudience;
    }

    /**
     * Set the harvest audience for this job. Taken from the harvestdefinition that generated this
     * job.
     *
     * @param theAudience the harvest-audience.
     */
    public void setHarvestAudience(String theAudience) {
        this.harvestAudience = theAudience;
    }

    ///////////// The following two methods are needed by harvestStatus-jobdetails.jsp ////////////

    /**
     * Returns a list of sorted seeds for this job. The sorting is by domain, and inside each
     * domain, the list is sorted by URL.
     *
     * @return a list of sorted seeds for this job.
     */
    public List<String> getSortedSeedList() {
        Map<String, Set<String>> urlMap = new HashMap<String, Set<String>>();
        for (String seed : seedListSet) {
            String url;
            // Assume the protocol is http://, if it is missing
            if (!seed.matches(Constants.PROTOCOL_REGEXP)) {
                url = "http://" + seed;
            } else {
                url = seed;
            }
            String domain = getDomain(url);
            if (domain == null) {
                // stop processing this url, and continue to the next seed
                continue;
            }
            Set<String> set;
            if (urlMap.containsKey(domain)) {
                set = urlMap.get(domain);
            } else {
                set = new TreeSet<String>();
                urlMap.put(domain, set);
            }
            set.add(seed);
        }
        List<String> result = new ArrayList<String>();
        for (Set<String> set : urlMap.values()) {
            result.addAll(set);
        }
        return result;
    }

    /**
     * Get the domain that the given URL belongs to.
     *
     * @param url a URL
     * @return the domain that the given URL belongs to, or null if it cannot be determined.
     */
    private String getDomain(String url) {
        try {
            URL uri = new URL(url);
            return DomainUtils.domainNameFromHostname(uri.getHost());
        } catch (MalformedURLException e) {
            log.warn("The string '{}' is not a valid URL", url);
            return null;
        }
    }
}
/**
 * A base class for {@link JobGenerator} implementations. It is recommended to extend this class to
 * implement a new job generator.
 *
 * <p>The base algorithm iterates over the domain configurations within the harvest definition and,
 * according to the configured subset size ({@link HarvesterSettings#JOBGEN_DOMAIN_CONFIG_SUBSET_SIZE}),
 * builds subsets of domain configurations from which one or more jobs will be generated.
 */
abstract class AbstractJobGenerator implements JobGenerator {

    /** Logger for this class. */
    private static Log log = LogFactory.getLog(AbstractJobGenerator.class);

    /** How many domain configurations to process in one go. */
    private final long DOMAIN_CONFIG_SUBSET_SIZE =
            Settings.getLong(HarvesterSettings.JOBGEN_DOMAIN_CONFIG_SUBSET_SIZE);

    /** Is deduplication enabled or disabled. */
    private final boolean DEDUPLICATION_ENABLED =
            Settings.getBoolean(HarvesterSettings.DEDUPLICATION_ENABLED);

    @Override
    public int generateJobs(HarvestDefinition harvest) {
        log.info("Generating jobs for harvestdefinition # " + harvest.getOid());
        int jobsMade = 0;
        final Iterator<DomainConfiguration> domainConfigurations = harvest.getDomainConfigurations();

        while (domainConfigurations.hasNext()) {
            List<DomainConfiguration> subset = new ArrayList<DomainConfiguration>();
            while (domainConfigurations.hasNext() && subset.size() < DOMAIN_CONFIG_SUBSET_SIZE) {
                subset.add(domainConfigurations.next());
            }

            Collections.sort(subset, getDomainConfigurationSubsetComparator(harvest));
            if (log.isTraceEnabled()) {
                log.trace(
                        subset.size()
                                + " domainconfigs now sorted and ready to be processed "
                                + "for harvest #"
                                + harvest.getOid());
            }

            jobsMade += processDomainConfigurationSubset(harvest, subset.iterator());
        }

        harvest.setNumEvents(harvest.getNumEvents() + 1);

        if (!harvest.isSnapShot()) {
            PartialHarvest focused = (PartialHarvest) harvest;
            Schedule schedule = focused.getSchedule();
            int numEvents = harvest.getNumEvents();

            // Calculate next event
            Date now = new Date();
            Date nextEvent = schedule.getNextEvent(focused.getNextDate(), numEvents);

            // Refuse to schedule event in the past
            if (nextEvent != null && nextEvent.before(now)) {
                int eventsSkipped = 0;
                while (nextEvent != null && nextEvent.before(now)) {
                    nextEvent = schedule.getNextEvent(nextEvent, numEvents);
                    eventsSkipped++;
                }
                if (log.isWarnEnabled()) {
                    log.warn(
                            "Refusing to schedule harvest definition '"
                                    + harvest.getName()
                                    + "' in the past. Skipped "
                                    + eventsSkipped
                                    + " events. Old nextDate was "
                                    + focused.getNextDate()
                                    + " new nextDate is "
                                    + nextEvent);
                }
            }

            // Set next event
            focused.setNextDate(nextEvent);
            if (log.isTraceEnabled()) {
                log.trace(
                        "Next event for harvest definition "
                                + harvest.getName()
                                + " happens: "
                                + (nextEvent == null ? "Never" : nextEvent.toString()));
            }
        }

        log.info(
                "Finished generating " + jobsMade + " jobs for harvestdefinition # " + harvest.getOid());
        return jobsMade;
    }

    /**
     * Instantiates a new job.
     *
     * @param cfg the {@link DomainConfiguration} being processed
     * @param harvest the {@link HarvestDefinition} being processed
     * @return an instance of {@link Job}
     */
    public static Job getNewJob(HarvestDefinition harvest, DomainConfiguration cfg) {
        HarvestChannelDAO harvestChannelDao = HarvestChannelDAO.getInstance();
        HarvestChannel channel = harvestChannelDao.getChannelForHarvestDefinition(harvest.getOid());
        if (channel == null) {
            log.info(
                    "No channel mapping registered for harvest id "
                            + harvest.getOid()
                            + ", will use default.");
            channel = harvestChannelDao.getDefaultChannel(harvest.isSnapShot());
        }
        if (harvest.isSnapShot()) {
            return Job.createSnapShotJob(
                    harvest.getOid(),
                    channel,
                    cfg,
                    harvest.getMaxCountObjects(),
                    harvest.getMaxBytes(),
                    ((FullHarvest) harvest).getMaxJobRunningTime(),
                    harvest.getNumEvents());
        }
        return Job.createJob(harvest.getOid(), channel, cfg, harvest.getNumEvents());
    }

    /**
     * Returns a comparator used to sort the subset of {@link #DOMAIN_CONFIG_SUBSET_SIZE}
     * configurations that are scanned at each iteration.
     *
     * @param harvest the {@link HarvestDefinition} being processed.
     * @return a comparator
     */
    protected abstract Comparator<DomainConfiguration> getDomainConfigurationSubsetComparator(
            HarvestDefinition harvest);

    /**
     * Create new jobs from a collection of configurations. All configurations must use the same
     * order.xml file.
     *
     * @param harvest the {@link HarvestDefinition} being processed.
     * @param domainConfSubset the configurations to use to create the jobs
     * @return The number of jobs created
     * @throws ArgumentNotValid if any of the parameters is null or if the cfglist does not contain
     *     any configurations
     */
    protected abstract int processDomainConfigurationSubset(
            HarvestDefinition harvest, Iterator<DomainConfiguration> domainConfSubset);

    @Override
    public boolean canAccept(Job job, DomainConfiguration cfg) {
        if (!checkAddDomainConfInvariant(job, cfg)) {
            return false;
        }
        return checkSpecificAcceptConditions(job, cfg);
    }

    /**
     * Called by {@link #canAccept(Job, DomainConfiguration)}. Tests the implementation-specific
     * conditions to accept the given {@link DomainConfiguration} in the given {@link Job}. It is
     * assumed that {@link #checkAddDomainConfInvariant(Job, DomainConfiguration)} has already
     * passed.
     *
     * @param job the {@link Job} being built
     * @param cfg the {@link DomainConfiguration} to test
     * @return true if the configuration passes the conditions.
     */
    protected abstract boolean checkSpecificAcceptConditions(Job job, DomainConfiguration cfg);

    /**
     * Once the job has been filled with {@link DomainConfiguration}s, performs the following
     * operations:
     *
     * <ol>
     * <li>Edit the harvest template to add/remove deduplicator configuration.
     * </ol>
     *
     * @param job the job
     */
    protected void editJobOrderXml(Job job) {
        Document doc = job.getOrderXMLdoc();
        if (DEDUPLICATION_ENABLED) {
            // Check that the Deduplicator element is present in the
            // OrderXml and enabled. If missing or disabled, log a warning
            if (!HeritrixTemplate.isDeduplicationEnabledInTemplate(doc)) {
                if (log.isWarnEnabled()) {
                    log.warn(
                            "Unable to perform deduplication for this job"
                                    + " as the required DeDuplicator element is "
                                    + "disabled or missing from template");
                }
            }
        } else {
            // Remove deduplicator Element from OrderXML if present
            Node xpathNode = doc.selectSingleNode(HeritrixTemplate.DEDUPLICATOR_XPATH);
            if (xpathNode != null) {
                xpathNode.detach();
                job.setOrderXMLDoc(doc);
                if (log.isInfoEnabled()) {
                    log.info("Removed DeDuplicator element because " + "Deduplication is disabled");
                }
            }
        }
    }

    /**
     * Tests that:
     *
     * <ol>
     * <li>The given domain configuration and job are not null.
     * <li>The job does not already contain the given domain configuration.
     * <li>The domain configuration has the same order xml name as the first inserted domain
     *     config.
     * </ol>
     *
     * @param job a given Job
     * @param cfg a given DomainConfiguration
     * @return true if the given DomainConfiguration can be inserted into the given job
     */
    private boolean checkAddDomainConfInvariant(Job job, DomainConfiguration cfg) {
        ArgumentNotValid.checkNotNull(job, "job");
        ArgumentNotValid.checkNotNull(cfg, "cfg");

        // check if the domain in DomainConfiguration cfg is not already in this job
        // domainName is used as key in domainConfigurationMap
        if (job.getDomainConfigurationMap().containsKey(cfg.getDomainName())) {
            if (log.isDebugEnabled()) {
                log.debug("Job already has a configuration for Domain '" + cfg.getDomainName() + "'.");
            }
            return false;
        }

        // check if the template is the same as this job's.
        String orderXMLname = job.getOrderXMLName();
        if (!orderXMLname.equals(cfg.getOrderXmlName())) {
            if (log.isDebugEnabled()) {
                log.debug(
                        "This Job only accepts configurations "
                                + "using the harvest template '"
                                + orderXMLname
                                + "'. This configuration uses the harvest template '"
                                + cfg.getOrderXmlName()
                                + "'.");
            }
            return false;
        }
        return true;
    }
}
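// Illustrative sketch (assumption, not part of the original source): the minimal shape of a
// concrete JobGenerator built on AbstractJobGenerator. The class name, the comparator choice and
// the one-job-per-rejection policy below are placeholders; real implementations apply size and
// byte/object limits in checkSpecificAcceptConditions and typically also persist new jobs through
// the JobDAO.
//
//     class SimpleJobGenerator extends AbstractJobGenerator {
//         @Override
//         protected Comparator<DomainConfiguration> getDomainConfigurationSubsetComparator(
//                 HarvestDefinition harvest) {
//             // placeholder ordering: sort configurations by name
//             return Comparator.comparing(DomainConfiguration::getName);
//         }
//
//         @Override
//         protected boolean checkSpecificAcceptConditions(Job job, DomainConfiguration cfg) {
//             return true; // accept everything that passes checkAddDomainConfInvariant
//         }
//
//         @Override
//         protected int processDomainConfigurationSubset(
//                 HarvestDefinition harvest, Iterator<DomainConfiguration> domainConfSubset) {
//             int jobsMade = 0;
//             Job current = null;
//             while (domainConfSubset.hasNext()) {
//                 DomainConfiguration cfg = domainConfSubset.next();
//                 if (current == null || !canAccept(current, cfg)) {
//                     current = getNewJob(harvest, cfg); // start a new job for this configuration
//                     jobsMade++;
//                 } else {
//                     current.addConfiguration(cfg);     // grow the current job
//                 }
//             }
//             return jobsMade;
//         }
//     }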