Example #1
0
  /**
   * Constructor for common initialisation.
   *
   * @param harvestID the id of the harvestdefinition
   * @param cfg the configuration to base the Job on
   * @param orderXMLdoc the harvest template (order.xml) to base the job on
   * @param channel the channel on which the job will be submitted.
   * @param forceMaxObjectsPerDomain the maximum number of objects harvested from a domain,
   *     overrides individual configuration settings. -1 means no limit
   * @param forceMaxBytesPerDomain The maximum number of bytes harvested from a domain, or -1 for
   *     no limit.
   * @param forceMaxJobRunningTime The max time in seconds given to the harvester for this job
   * @param harvestNum the run number of the harvest definition
   * @throws ArgumentNotValid if cfg, harvestID or channel is null, if harvestID is negative, or if
   *     any limit < -1
   */
  public Job(
      Long harvestID,
      DomainConfiguration cfg,
      HeritrixTemplate orderXMLdoc,
      HarvestChannel channel,
      long forceMaxObjectsPerDomain,
      long forceMaxBytesPerDomain,
      long forceMaxJobRunningTime,
      int harvestNum)
      throws ArgumentNotValid {
    ArgumentNotValid.checkNotNull(cfg, "cfg");
    ArgumentNotValid.checkNotNull(harvestID, "harvestID");
    ArgumentNotValid.checkNotNegative(harvestID, "harvestID");
    ArgumentNotValid.checkNotNull(channel, "channel");

    if (forceMaxObjectsPerDomain < -1) {
      String msg = "forceMaxObjectsPerDomain must be either -1 or positive";
      log.debug(msg);
      throw new ArgumentNotValid(msg);
    }
    if (forceMaxBytesPerDomain < -1) {
      String msg = "forceMaxBytesPerDomain must be either -1 or positive";
      log.debug(msg);
      throw new ArgumentNotValid(msg);
    }

    if (forceMaxBytesPerDomain == 0L) {
      log.warn(
          "forceMaxBytesPerDomain should probably not be 0.Means 0 bytes downloaded per domain");
    }

    if (forceMaxObjectsPerDomain == 0L) {
      log.warn(
          "forceMaxObjectsPerDomain should probably not be 0.Means 0 objects downloaded per domain");
    }

    // setup initial members
    domainConfigurationMap = new HashMap<>();
    origHarvestDefinitionID = harvestID;
    orderXMLname = cfg.getOrderXmlName();
    this.orderXMLdoc = orderXMLdoc;

    setHarvestChannel(channel);

    long maxObjects = NumberUtils.minInf(forceMaxObjectsPerDomain, cfg.getMaxObjects());
    setMaxObjectsPerDomain(maxObjects);
    configurationSetsObjectLimit = (maxObjects != forceMaxObjectsPerDomain);

    long maxBytes = NumberUtils.minInf(forceMaxBytesPerDomain, cfg.getMaxBytes());
    setMaxBytesPerDomain(maxBytes);
    configurationSetsByteLimit = (maxBytes != forceMaxBytesPerDomain);

    long expectation =
        cfg.getExpectedNumberOfObjects(forceMaxObjectsPerDomain, forceMaxBytesPerDomain);
    maxCountObjects = expectation;
    minCountObjects = expectation;
    this.harvestNum = harvestNum;

    addConfiguration(cfg);

    setMaxJobRunningTime(forceMaxJobRunningTime);

    setArchiveFormatInTemplate(Settings.get(HarvesterSettings.HERITRIX_ARCHIVE_FORMAT));

    setAttributes(cfg.getAttributesAndTypes());

    orderXMLdoc.enableOrDisableDeduplication(
        Settings.getBoolean(HarvesterSettings.DEDUPLICATION_ENABLED));

    status = JobStatus.NEW;
  }
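
The limit merging above relies on NumberUtils.minInf, which treats -1 as "no limit" (infinity) and otherwise picks the smaller value. Below is a minimal, self-contained sketch of that semantics; the helper is illustrative only and not the actual NetarchiveSuite implementation.

public final class LimitMergeSketch {

  /** Illustrative stand-in for NumberUtils.minInf: -1 means infinity, otherwise take the minimum. */
  static long minInf(long a, long b) {
    if (a == -1L) {
      return b;
    }
    if (b == -1L) {
      return a;
    }
    return Math.min(a, b);
  }

  public static void main(String[] args) {
    // Forced limit of 10000 objects vs. an unlimited (-1) configuration: the forced limit wins.
    System.out.println(minInf(10000L, -1L)); // 10000
    // Both unlimited: the result stays -1 (no limit).
    System.out.println(minInf(-1L, -1L)); // -1
    // The configuration limit (2000) is stricter than the forced limit (5000), so it applies,
    // and configurationSetsObjectLimit would be true in the constructor above.
    System.out.println(minInf(5000L, 2000L)); // 2000
  }
}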
Example #2
0
/**
 * This class represents one job to be run by Heritrix. It is based on a number of configurations
 * that all use the same order.xml, with at most one configuration per domain. Each job consists of
 * configurations of approximately the same size; that is, the expected sizes of the smallest and
 * largest configurations differ by at most the factor limMaxRelSize (differences smaller than
 * limMinAbsSize are ignored). There is a limit, limMaxTotalSize, on the total size of the job in
 * objects.
 *
 * <p>A job may also be limited on bytes or objects, defined either by the configurations in the job
 * or the harvest definition the job is generated by.
 *
 * <p>The job contains the order file, the seedlist and the current status of the job, as well as
 * the ID of the harvest definition that defined it and names of all the configurations it is based
 * on.
 */
@SuppressWarnings({"serial"})
public class Job implements Serializable, JobInfo {
  private static final transient Logger log = LoggerFactory.getLogger(Job.class);

  // Persistent fields stored in and read from DAO
  /** The persistent ID of this job. */
  private Long jobID;
  /** The Id of the harvestdefinition, that generated this job. */
  protected Long origHarvestDefinitionID;
  /** The status of the job. See the JobStatus class for the possible states. */
  protected JobStatus status;
  /** The name of the {@link HarvestChannel} on which this job will be posted. */
  private String channel;

  /** Whether the job belongs to a snapshot or partial harvest. */
  private boolean isSnapshot;
  /**
   * Overrides the individual configurations maximum setting for objects retrieved from a domain
   * when set to a positive value.
   */
  private long forceMaxObjectsPerDomain = Constants.HERITRIX_MAXOBJECTS_INFINITY;
  /**
   * Overrides the individual configurations maximum setting for bytes retrieved from a domain when
   * set to other than -1.
   */
  private long forceMaxBytesPerDomain = Constants.HERITRIX_MAXBYTES_INFINITY;
  /** The name of the harvest template used by the job. */
  private String orderXMLname;
  /** The harvest template used by the job. */
  private HeritrixTemplate orderXMLdoc;
  /** The list of Heritrix settings files. */
  private File[] settingsXMLfiles;

  /** The corresponding Dom4j Documents for these files. */
  // private Document[] settingsXMLdocs;

  /**
   * A set of seeds involved in this job. Outside the setSeedList() method, the set of seeds is
   * updated in the addConfiguration() method.
   */
  private Set<String> seedListSet = new HashSet<String>();
  /** Which run of the harvest definition this is. */
  private int harvestNum;
  /** Errors during harvesting. */
  private String harvestErrors;
  /** Details about errors during harvesting. */
  private String harvestErrorDetails;
  /** Errors during upload of the harvested data. */
  private String uploadErrors;
  /** Details about errors during upload of the harvested data. */
  private String uploadErrorDetails;
  /** The starting point of the job. */
  private Date actualStart;
  /** The ending point of the job. */
  private Date actualStop;
  /** The time when this job was submitted. */
  private Date submittedDate;
  /** The time when this job was created. */
  private Date creationDate;

  /** Edition is used by the DAO to keep track of changes. */
  private long edition = -1;

  /** Resubmitted as the Job with this ID. If null, this job has not been resubmitted. */
  private Long resubmittedAsJobWithID;

  /** Continuation of this job. */
  private Long continuationOF;

  /**
   * A map (domainName, domainConfigurationName), must be accessible in order to update job
   * information (see Ass. 2.4.3)
   */
  private Map<String, String> domainConfigurationMap;
  /**
   * A hint to the DAO that configurations have changed. Since configurations are large, the DAO
   * can use the fact that this is false to avoid updating the config list. The DAO can set it to
   * false after saving configurations.
   */
  boolean configsChanged = false;

  // Intermediate fields, non-persistent and only used while building objects

  /**
   * Whether the maxObjects field was defined by the harvest definition or the configuration limit.
   * This determines whether we accept smaller configurations when building jobs. True means the
   * limit is defined by the configuration; false means it is defined by the harvest definition.
   */
  private boolean configurationSetsObjectLimit;

  /**
   * Whether the maxBytes field was defined by the harvest definition or the configuration limit.
   * This determines whether we accept smaller configurations when building jobs. True means the
   * limit is defined by the configuration; false means it is defined by the harvest definition.
   */
  private boolean configurationSetsByteLimit;

  /** The lowest number of objects expected by a configuration. */
  private long minCountObjects;

  /** The highest number of objects expected by a configuration. */
  private long maxCountObjects;

  /** The total number of objects expected by all added configurations. */
  private long totalCountObjects;

  /** The max time in seconds given to the harvester for this job. 0 is unlimited. */
  private long forceMaxRunningTime;

  /**
   * If true, this job object is still undergoing changes due to having more configurations added.
   * Once set to false, the object is considered immutable except for updates of its status.
   *
   * <p>Jobs loaded from the DAO are never under construction anymore.
   */
  private boolean underConstruction = true;

  // Constants

  // Note: The following constants are intentionally left non-static for easy
  // unit testing

  private boolean maxObjectsIsSetByQuotaEnforcer =
      Settings.getBoolean(HarvesterSettings.OBJECT_LIMIT_SET_BY_QUOTA_ENFORCER);

  /**
   * The harvestname prefix used in the files generated by Heritrix. Is set using an
   * ArchiveFileNaming class when the jobID is available.
   */
  private String harvestnamePrefix;

  /** This variable is right now the same as harvestdefinitions.audience field. */
  private String harvestAudience;

  protected Job() {
    this.status = JobStatus.NEW;
  }

  /**
   * Constructor for common initialisation.
   *
   * @param harvestID the id of the harvestdefinition
   * @param cfg the configuration to base the Job on
   * @param orderXMLdoc the harvest template (order.xml) to base the job on
   * @param channel the channel on which the job will be submitted.
   * @param forceMaxObjectsPerDomain the maximum number of objects harvested from a domain,
   *     overrides individual configuration settings. -1 means no limit
   * @param forceMaxBytesPerDomain The maximum number of bytes harvested from a domain, or -1 for
   *     no limit.
   * @param forceMaxJobRunningTime The max time in seconds given to the harvester for this job
   * @param harvestNum the run number of the harvest definition
   * @throws ArgumentNotValid if cfg, harvestID or channel is null, if harvestID is negative, or if
   *     any limit < -1
   */
  public Job(
      Long harvestID,
      DomainConfiguration cfg,
      HeritrixTemplate orderXMLdoc,
      HarvestChannel channel,
      long forceMaxObjectsPerDomain,
      long forceMaxBytesPerDomain,
      long forceMaxJobRunningTime,
      int harvestNum)
      throws ArgumentNotValid {
    ArgumentNotValid.checkNotNull(cfg, "cfg");
    ArgumentNotValid.checkNotNull(harvestID, "harvestID");
    ArgumentNotValid.checkNotNegative(harvestID, "harvestID");
    ArgumentNotValid.checkNotNull(channel, "channel");

    if (forceMaxObjectsPerDomain < -1) {
      String msg = "forceMaxObjectsPerDomain must be either -1 or positive";
      log.debug(msg);
      throw new ArgumentNotValid(msg);
    }
    if (forceMaxBytesPerDomain < -1) {
      String msg = "forceMaxBytesPerDomain must be either -1 or positive";
      log.debug(msg);
      throw new ArgumentNotValid(msg);
    }

    if (forceMaxBytesPerDomain == 0L) {
      log.warn(
          "forceMaxBytesPerDomain should probably not be 0.Means 0 bytes downloaded per domain");
    }

    if (forceMaxObjectsPerDomain == 0L) {
      log.warn(
          "forceMaxObjectsPerDomain should probably not be 0.Means 0 objects downloaded per domain");
    }

    // setup initial members
    domainConfigurationMap = new HashMap<>();
    origHarvestDefinitionID = harvestID;
    orderXMLname = cfg.getOrderXmlName();
    this.orderXMLdoc = orderXMLdoc;

    setHarvestChannel(channel);

    long maxObjects = NumberUtils.minInf(forceMaxObjectsPerDomain, cfg.getMaxObjects());
    setMaxObjectsPerDomain(maxObjects);
    configurationSetsObjectLimit = (maxObjects != forceMaxObjectsPerDomain);

    long maxBytes = NumberUtils.minInf(forceMaxBytesPerDomain, cfg.getMaxBytes());
    setMaxBytesPerDomain(maxBytes);
    configurationSetsByteLimit = (maxBytes != forceMaxBytesPerDomain);

    long expectation =
        cfg.getExpectedNumberOfObjects(forceMaxObjectsPerDomain, forceMaxBytesPerDomain);
    maxCountObjects = expectation;
    minCountObjects = expectation;
    this.harvestNum = harvestNum;

    addConfiguration(cfg);

    setMaxJobRunningTime(forceMaxJobRunningTime);

    setArchiveFormatInTemplate(Settings.get(HarvesterSettings.HERITRIX_ARCHIVE_FORMAT));

    setAttributes(cfg.getAttributesAndTypes());

    orderXMLdoc.enableOrDisableDeduplication(
        Settings.getBoolean(HarvesterSettings.DEDUPLICATION_ENABLED));

    status = JobStatus.NEW;
  }

  public void setAttributes(List<AttributeAndType> attributesAndTypes) {
    orderXMLdoc.insertAttributes(attributesAndTypes);
  }

  /** Update the order template according to the chosen archive format (arc/warc). */
  private void setArchiveFormatInTemplate(String archiveFormat) {
    if (!underConstruction) {
      final String msg = "Cannot modify job " + this + " as it is no longer under construction";
      log.debug(msg);
      throw new IllegalState(msg);
    }
    orderXMLdoc.setArchiveFormat(archiveFormat);
  }

  /**
   * Create a new Job object from basic information stored in the DAO.
   *
   * @param harvestID the id of the harvestdefinition
   * @param configurations the configurations to base the Job on
   * @param channel the name of the channel on which the job will be submitted.
   * @param snapshot whether the job belongs to a snapshot harvest
   * @param forceMaxObjectsPerDomain the maximum number of objects harvested from a domain,
   *     overrides individual configuration settings. 0 means no limit.
   * @param forceMaxBytesPerDomain The maximum number of bytes harvested from a domain, or -1 for
   *     no limit.
   * @param forceMaxJobRunningTime The max time in seconds given to the harvester for this job
   * @param status the current status of the job.
   * @param orderXMLname the name of the order template used.
   * @param orderXMLdoc the (possibly modified) template
   * @param seedlist the combined seedlist from all configs.
   * @param harvestNum the run number of the harvest definition
   * @param continuationOf the ID of the job this job continues (using the Heritrix recover-log), or
   *     null if it starts from scratch
   */
  Job(
      Long harvestID,
      Map<String, String> configurations,
      String channel,
      boolean snapshot,
      long forceMaxObjectsPerDomain,
      long forceMaxBytesPerDomain,
      long forceMaxJobRunningTime,
      JobStatus status,
      String orderXMLname,
      HeritrixTemplate orderXMLdoc,
      String seedlist,
      int harvestNum,
      Long continuationOf) {
    origHarvestDefinitionID = harvestID;
    domainConfigurationMap = configurations;
    this.channel = channel;
    this.isSnapshot = snapshot;
    this.forceMaxBytesPerDomain = forceMaxBytesPerDomain;
    this.forceMaxObjectsPerDomain = forceMaxObjectsPerDomain;
    this.forceMaxRunningTime = forceMaxJobRunningTime;
    this.status = status;
    this.orderXMLname = orderXMLname;
    this.orderXMLdoc = orderXMLdoc;
    this.setSeedList(seedlist);
    this.harvestNum = harvestNum;
    this.continuationOF = continuationOf;

    underConstruction = false;
  }

  /**
   * Adds a configuration to this Job. Seedlists and settings are updated accordingly.
   *
   * @param cfg the configuration to add
   * @throws ArgumentNotValid if cfg is null or cfg uses a different orderxml than this job or if
   *     this job already contains a configuration associated with domain of configuration cfg.
   */
  public void addConfiguration(DomainConfiguration cfg) {
    ArgumentNotValid.checkNotNull(cfg, "cfg");
    if (domainConfigurationMap.containsKey(cfg.getDomainName())) {
      throw new ArgumentNotValid(
          "Job already has a configuration for Domain " + cfg.getDomainName());
    }

    if (log.isTraceEnabled()) {
      log.trace("Adding configuration '{}' to job '{}'", cfg, cfg.getName());
    }

    if (!underConstruction) {
      final String msg = "Cannot modify job " + this + " as it is no longer under construction";
      log.debug(msg);
      throw new IllegalState(msg);
    }

    if (!cfg.getOrderXmlName().equals(getOrderXMLName())) {
      throw new ArgumentNotValid(
          "Job requires the orderxml file:'"
              + getOrderXMLName()
              + "' not:'"
              + cfg.getOrderXmlName()
              + "' used by the configuration:'"
              + cfg.getName());
    }

    domainConfigurationMap.put(cfg.getDomainName(), cfg.getName());

    // Add the seeds from the configuration to the Job seeds.
    // Take care of duplicates.
    for (Iterator<SeedList> itt = cfg.getSeedLists(); itt.hasNext(); ) {
      SeedList seed = itt.next();
      List<String> seeds = seed.getSeeds();
      for (String seedUrl : seeds) {
        seedListSet.add(seedUrl); // duplicates are silently ignored

        // TODO remove when heritrix implements this functionality
        // try to convert a seed into an Internationalized Domain Name
        try {
          String seedASCII = seedUrl;
          // It is rare to see these seeds, but they need to be
          // correctly idnaized
          if (seedUrl.contains(":") || seedUrl.contains("/")) {
            String normalizedUrl = seedUrl;
            if (!normalizedUrl.matches("^[a-zA-Z]+:.*")) {
              // If no protocol is given, assume http
              normalizedUrl = "http://" + normalizedUrl;
            }
            URL url = new URL(normalizedUrl);
            String domainName = url.getHost();
            String domainNameASCII = IDNA.toASCII(domainName);
            if (!domainName.equals(domainNameASCII)) {
              // If the domain name changed, replace that in the
              // seed.
              seedASCII = seedUrl.replaceFirst(Pattern.quote(domainName), domainNameASCII);
            }
          } else {
            seedASCII = IDNA.toASCII(seedUrl);
          }
          if (!seedASCII.equals(seedUrl)) {
            log.trace("Converted {} to {}", seedUrl, seedASCII);
            // Note that duplicates are silently ignored
            seedListSet.add(seedASCII);
          }
        } catch (IDNAException e) {
          log.trace("Cannot convert seed {} to ASCII", seedUrl, e);
        } catch (MalformedURLException e) {
          log.trace("Cannot convert seed {} to ASCII", seedUrl, e);
        }
      }
    }

    orderXMLdoc.editOrderXMLAddPerDomainCrawlerTraps(cfg);

    // TODO update limits in settings files - see also bug 269

    // Update estimates of job size
    long expectation =
        cfg.getExpectedNumberOfObjects(forceMaxObjectsPerDomain, forceMaxBytesPerDomain);
    maxCountObjects = Math.max(expectation, maxCountObjects);
    minCountObjects = Math.min(expectation, minCountObjects);
    totalCountObjects += expectation;

    configsChanged = true;

    assert (maxCountObjects >= minCountObjects) : "basic invariant";
  }

  /**
   * Get the name of the order XML file used by this Job.
   *
   * @return the name of the orderXML file
   */
  public String getOrderXMLName() {
    return orderXMLname;
  }

  /**
   * Get the actual time when this job was stopped/completed.
   *
   * @return the time as Date
   */
  public Date getActualStop() {
    return actualStop;
  }

  /**
   * Get the actual time when this job was started.
   *
   * @return the time as Date
   */
  public Date getActualStart() {
    return actualStart;
  }

  /**
   * Get the time when this job was submitted.
   *
   * @return the time as Date
   */
  public Date getSubmittedDate() {
    return submittedDate;
  }

  /**
   * Get the time when this job was created.
   *
   * @return the creation time as a <code>Date</code>
   */
  public Date getCreationDate() {
    return creationDate;
  }

  /**
   * Get a list of Heritrix settings.xml files. Note that these files have nothing to do with
   * NetarchiveSuite settings files. They are files that supplement the Heritrix order.xml files,
   * and contain overrides for specific domains.
   *
   * @return the list of Files as an array
   */
  public File[] getSettingsXMLfiles() {
    return settingsXMLfiles;
  }

  /**
   * Get the id of the HarvestDefinition from which this job originates.
   *
   * @return the id as a Long
   */
  public Long getOrigHarvestDefinitionID() {
    return origHarvestDefinitionID;
  }

  /**
   * Get the id of this Job.
   *
   * @return the id as a Long
   */
  public Long getJobID() {
    return jobID;
  }

  /**
   * Set the id of this Job.
   *
   * @param id The Id for this job.
   */
  public void setJobID(Long id) {
    jobID = id;
  }

  /**
   * Gets the total number of different domains harvested by this job.
   *
   * @return the number of domain configurations added to this job
   */
  public int getCountDomains() {
    return domainConfigurationMap.size();
  }

  /**
   * Set the actual time when this job was started.
   *
   * <p>Logs a warning if actualStart is set to a time after actualStop.
   *
   * @param actualStart A Date object representing the time when this job was started.
   */
  public void setActualStart(Date actualStart) {
    ArgumentNotValid.checkNotNull(actualStart, "actualStart");
    if (actualStop != null && actualStop.before(actualStart)) {
      log.warn(
          "Job("
              + getJobID()
              + "): Start time ("
              + actualStart
              + ") is after end time: "
              + actualStop);
    }
    this.actualStart = (Date) actualStart.clone();
  }

  /**
   * Set the actual time when this job was stopped/completed. Logs a warning if actualStop is set
   * to a time before actualStart.
   *
   * @param actualStop A Date object representing the time when this job was stopped.
   * @throws ArgumentNotValid if actualStop is null
   */
  public void setActualStop(Date actualStop) throws ArgumentNotValid {
    ArgumentNotValid.checkNotNull(actualStop, "actualStop");
    if (actualStart == null) {
      log.warn("Job(" + getJobID() + "): actualStart should be defined before setting actualStop");
    } else if (actualStop.before(actualStart)) {
      log.warn(
          "Job("
              + getJobID()
              + "): actualStop ("
              + actualStop
              + ") is before actualStart: "
              + actualStart);
    }
    this.actualStop = (Date) actualStop.clone();
  }

  /**
   * Set the orderxml for this job.
   *
   * @param doc An order xml (harvest template) to be used by this job
   */
  public void setOrderXMLDoc(HeritrixTemplate doc) {
    ArgumentNotValid.checkNotNull(doc, "doc");
    this.orderXMLdoc = doc;
  }

  /**
   * Gets the harvest template (order.xml) associated with this Job.
   *
   * @return the template as a {@link HeritrixTemplate}
   */
  public HeritrixTemplate getOrderXMLdoc() {
    return orderXMLdoc;
  }

  //    /**
  //     * Gets a list of document representations of the settings.xml's associated with this Job.
  //     *
  //     * @return the XML as an array of org.dom4j.Document
  //     */
  //    public Document[] getSettingsXMLdocs() {
  //        return settingsXMLdocs;
  //    }

  /**
   * Set the seedlist of the job from the seedList argument. Individual seeds are separated by a
   * '\n' character. Duplicate seeds are removed.
   *
   * @param seedList List of seeds as one String
   */
  public void setSeedList(String seedList) {
    ArgumentNotValid.checkNotNullOrEmpty(seedList, "seedList");
    seedListSet = new HashSet<>();
    BufferedReader reader = new BufferedReader(new StringReader(seedList));
    String seed;
    try {
      while ((seed = reader.readLine()) != null) {
        seedListSet.add(seed); // add to seedlist if not already there
      }
    } catch (IOException e) {
      // This never happens, as we're reading from a string!
      throw new IOFailure("IOException reading from seed string", e);
    } finally {
      IOUtils.closeQuietly(reader);
    }
  }

  /**
   * Get the seedlist as a String. The individual seeds are separated by the character '\n'. The
   * order of the seeds is unknown.
   *
   * @return the seedlist as a String
   */
  public String getSeedListAsString() {
    return StringUtils.conjoin("\n", seedListSet);
  }

  /**
   * Get the current status of this Job.
   *
   * @return the status as a {@link JobStatus} value
   */
  public JobStatus getStatus() {
    return status;
  }

  /**
   * Sets status of this job.
   *
   * @param newStatus the new {@link JobStatus} for this job
   * @throws ArgumentNotValid in case of invalid status argument or invalid status change
   */
  public void setStatus(JobStatus newStatus) {
    ArgumentNotValid.checkNotNull(newStatus, "newStatus");
    if (!status.legalChange(newStatus)) {
      final String message =
          "Status change from " + status + " to " + newStatus + " is not allowed";
      log.debug(message);
      throw new ArgumentNotValid(message);
    }

    if ((this.status == JobStatus.NEW || this.status == JobStatus.RESUBMITTED)
        && newStatus == JobStatus.SUBMITTED) {
      orderXMLdoc.configureQuotaEnforcer(
          maxObjectsIsSetByQuotaEnforcer, forceMaxBytesPerDomain, forceMaxObjectsPerDomain);
    }

    if (this.status == JobStatus.SUBMITTED && newStatus == JobStatus.STARTED) {
      setActualStart(new Date());
    }
    if (this.status == JobStatus.STARTED
        && (newStatus == JobStatus.DONE || newStatus == JobStatus.FAILED)) {
      setActualStop(new Date());
    }
    status = newStatus;
  }

  /**
   * Returns a map of domain names and name of their corresponding configuration.
   *
   * <p>The returned Map cannot be changed.
   *
   * @return a read-only Map from domain name to configuration name
   */
  public Map<String, String> getDomainConfigurationMap() {
    return Collections.unmodifiableMap(domainConfigurationMap);
  }

  /**
   * Gets the maximum number of objects harvested per domain.
   *
   * @return The maximum number of objects harvested per domain. 0 means no limit.
   */
  public long getMaxObjectsPerDomain() {
    return forceMaxObjectsPerDomain;
  }

  /**
   * Gets the maximum number of bytes harvested per domain.
   *
   * @return The maximum number of bytes harvested per domain. -1 means no limit.
   */
  public long getMaxBytesPerDomain() {
    return forceMaxBytesPerDomain;
  }

  /**
   * Get the edition number.
   *
   * @return The edition number
   */
  long getEdition() {
    return edition;
  }

  /**
   * Set the edition number.
   *
   * @param edition the new edition number
   */
  void setEdition(long edition) {
    this.edition = edition;
  }

  public void setHarvestChannel(HarvestChannel harvestChannel) {
    this.channel = harvestChannel.getName();
    this.isSnapshot = harvestChannel.isSnapshot();
  }

  /** @return the associated {@link HarvestChannel} name. */
  public String getChannel() {
    return channel;
  }

  /**
   * Sets the associated {@link HarvestChannel} name.
   *
   * @param channel the channel name
   */
  public void setChannel(String channel) {
    this.channel = channel;
  }

  /**
   * @return true if the job belongs to a snapshot harvest, false if it belongs to a focused
   *     harvest.
   */
  public boolean isSnapshot() {
    return isSnapshot;
  }

  /**
   * Sets whether job belongs to a snapshot or focused harvest.
   *
   * @param isSnapshot true if the job belongs to a snapshot harvest, false if it belongs to a
   *     focused harvest.
   */
  public void setSnapshot(boolean isSnapshot) {
    this.isSnapshot = isSnapshot;
  }

  @Override
  public String toString() {
    return "Job "
        + getJobID()
        + " (state = "
        + getStatus()
        + ", HD = "
        + getOrigHarvestDefinitionID()
        + ", channel = "
        + getChannel()
        + ", snapshot = "
        + isSnapshot()
        + ", forcemaxcount = "
        + getForceMaxObjectsPerDomain()
        + ", forcemaxbytes = "
        + getMaxBytesPerDomain()
        + ", forcemaxrunningtime = "
        + forceMaxRunningTime
        + ", orderxml = "
        + getOrderXMLName()
        + ", numconfigs = "
        + getDomainConfigurationMap().size()
        + ", created = "
        + getCreationDate()
        + (getSubmittedDate() != null ? ", submitted = " + getSubmittedDate() : "")
        + (getActualStart() != null ? ", started = " + getActualStart() : "")
        + (getActualStop() != null ? ", stopped = " + getActualStop() : "")
        + ")";
  }

  /** @return Returns the forceMaxObjectsPerDomain. 0 means no limit. */
  public long getForceMaxObjectsPerDomain() {
    return forceMaxObjectsPerDomain;
  }

  /**
   * Sets the maxObjectsPerDomain value.
   *
   * @param maxObjectsPerDomain The forceMaxObjectsPerDomain to set. 0 means no limit.
   * @throws IOFailure Thrown from auxiliary method editOrderXML_maxObjectsPerDomain.
   */
  protected void setMaxObjectsPerDomain(long maxObjectsPerDomain) {
    if (!underConstruction) {
      final String msg = "Cannot modify job " + this + " as it is no longer under construction";
      log.debug(msg);
      throw new IllegalState(msg);
    }

    this.forceMaxObjectsPerDomain = maxObjectsPerDomain;
    orderXMLdoc.setMaxObjectsPerDomain(
        maxObjectsPerDomain); // FIXME? add argument to maxObjectsIsSetByQuotaEnforcer to method
    // setMaxObjectsPerDomain
    // orderXMLdoc.editOrderXML_maxObjectsPerDomain(orderXMLdoc, maxObjectsPerDomain,
    //        maxObjectsIsSetByQuotaEnforcer);

    if (0L == maxObjectsPerDomain && 0L != forceMaxBytesPerDomain) {
      setMaxBytesPerDomain(0L);
    }
  }

  /**
   * Set the maxbytes per domain value.
   *
   * @param maxBytesPerDomain The maxBytesPerDomain to set, or -1 for no limit.
   */
  protected void setMaxBytesPerDomain(long maxBytesPerDomain) {
    if (!underConstruction) {
      final String msg = "Cannot modify job " + this + " as it is no longer under construction";
      log.debug(msg);
      throw new IllegalState(msg);
    }
    this.forceMaxBytesPerDomain = maxBytesPerDomain;
    orderXMLdoc.setMaxBytesPerDomain(maxBytesPerDomain);

    if (0L == maxBytesPerDomain && 0L != forceMaxObjectsPerDomain) {
      setMaxObjectsPerDomain(0L);
    }
  }

  /**
   * Set the maxJobRunningTime value.
   *
   * @param maxJobRunningTime The maxJobRunningTime in seconds to set, or 0 for no limit.
   */
  protected void setMaxJobRunningTime(long maxJobRunningTime) {
    if (!underConstruction) {
      final String msg = "Cannot modify job " + this + " as it is no longer under construction";
      log.debug(msg);
      throw new IllegalState(msg);
    }
    this.forceMaxRunningTime = maxJobRunningTime;
    orderXMLdoc.setMaxJobRunningTime(maxJobRunningTime);
  }

  /** @return Returns the MaxJobRunningTime. 0 means no limit. */
  public long getMaxJobRunningTime() {
    return forceMaxRunningTime;
  }

  /**
   * Get the harvestNum for this job. The number reflects which run of the harvest definition this
   * is.
   *
   * @return the harvestNum for this job.
   */
  public int getHarvestNum() {
    return harvestNum;
  }

  /**
   * Set the harvestNum for this job. The number reflects which run of the harvest definition this
   * is. ONLY TO BE USED IN THE CONSTRUCTION PHASE.
   *
   * @param harvestNum a given harvestNum
   */
  public void setHarvestNum(int harvestNum) {
    if (!underConstruction) {
      final String msg = "Cannot modify job " + this + " as it is no longer under construction";
      log.debug(msg);
      throw new IllegalState(msg);
    }
    this.harvestNum = harvestNum;
  }

  /**
   * Get the list of harvest errors for this job. If there are no harvest errors, null is returned.
   * This value is not meaningful until the job is finished (FAILED, DONE, RESUBMITTED).
   *
   * @return the harvest errors for this job or null if no harvest errors.
   */
  public String getHarvestErrors() {
    return harvestErrors;
  }

  /**
   * Append to the list of harvest errors for this job. Nothing happens if the argument
   * harvestErrors is null.
   *
   * @param harvestErrors a string containing harvest errors (may be null)
   */
  public void appendHarvestErrors(String harvestErrors) {
    if (harvestErrors != null) {
      if (this.harvestErrors == null) {
        this.harvestErrors = harvestErrors;
      } else {
        this.harvestErrors += "\n" + harvestErrors;
      }
    }
  }

  /**
   * Get the list of harvest error details for this job. If there are no harvest error details,
   * null is returned. This value is not meaningful until the job is finished (FAILED, DONE,
   * RESUBMITTED).
   *
   * @return the list of harvest error details for this job or null if no harvest error details.
   */
  public String getHarvestErrorDetails() {
    return harvestErrorDetails;
  }

  /**
   * Append to the list of harvest error details for this job. Nothing happens if the argument
   * harvestErrorDetails is null.
   *
   * @param harvestErrorDetails a string containing harvest error details.
   */
  public void appendHarvestErrorDetails(String harvestErrorDetails) {
    if (harvestErrorDetails != null) {
      if (this.harvestErrorDetails == null) {
        this.harvestErrorDetails = harvestErrorDetails;
      } else {
        this.harvestErrorDetails += "\n" + harvestErrorDetails;
      }
    }
  }

  /**
   * Get the list of upload errors. If there are no upload errors, null is returned. This value is
   * not meaningful until the job is finished (FAILED, DONE, RESUBMITTED).
   *
   * @return the list of upload errors as String, or null if no upload errors.
   */
  public String getUploadErrors() {
    return uploadErrors;
  }

  /**
   * Append to the list of upload errors. Nothing happens if the argument uploadErrors is null.
   *
   * @param uploadErrors a string containing upload errors.
   */
  public void appendUploadErrors(String uploadErrors) {
    if (uploadErrors != null) {
      if (this.uploadErrors == null) {
        this.uploadErrors = uploadErrors;
      } else {
        this.uploadErrors += "\n" + uploadErrors;
      }
    }
  }

  /**
   * Get the list of upload error details. If there are no upload error details, null is returned.
   * This value is not meaningful until the job is finished (FAILED, DONE, RESUBMITTED).
   *
   * @return the list of upload error details as String, or null if no upload error details
   */
  public String getUploadErrorDetails() {
    return uploadErrorDetails;
  }

  /**
   * Append to the list of upload error details. Nothing happens if the argument
   * uploadErrorDetails is null.
   *
   * @param uploadErrorDetails a string containing upload error details.
   */
  public void appendUploadErrorDetails(String uploadErrorDetails) {
    if (uploadErrorDetails != null) {
      if (this.uploadErrorDetails == null) {
        this.uploadErrorDetails = uploadErrorDetails;
      } else {
        this.uploadErrorDetails += "\n" + uploadErrorDetails;
      }
    }
  }

  /**
   * Get the ID for the job which this job was resubmitted as. If null, this job has not been
   * resubmitted.
   *
   * @return this ID.
   */
  public Long getResubmittedAsJob() {
    return resubmittedAsJobWithID;
  }

  /**
   * Set the Date for when this job was submitted. If null, this job has not been submitted.
   *
   * @param submittedDate The date when this was submitted
   */
  public void setSubmittedDate(Date submittedDate) {
    this.submittedDate = submittedDate;
  }

  /**
   * Set the Date for when this job was created. If null, this job has not been created.
   *
   * @param creationDate The date when this was created
   */
  public void setCreationDate(Date creationDate) {
    this.creationDate = creationDate;
  }

  /**
   * Set the ID for the job which this job was resubmitted as.
   *
   * @param resubmittedAsJob An Id for a new job.
   */
  public void setResubmittedAsJob(Long resubmittedAsJob) {
    this.resubmittedAsJobWithID = resubmittedAsJob;
  }

  /**
   * @return id of the job that this job is supposed to continue using Heritrix recover-log or null
   *     if it starts from scratch.
   */
  public Long getContinuationOf() {
    return this.continuationOF;
  }

  @Override
  public String getHarvestFilenamePrefix() {
    if (this.harvestnamePrefix == null) {
      log.warn(
          "HarvestnamePrefix not yet set for job {}. Set it by using the naming scheme. "
              + "This should only happen for old jobs being read",
          this.jobID);
      setDefaultHarvestNamePrefix();
    }
    return this.harvestnamePrefix;
  }

  /** @param prefix the harvest filename prefix to use for this job. */
  public void setHarvestFilenamePrefix(String prefix) {
    this.harvestnamePrefix = prefix;
  }

  /** @return the forceMaxBytesPerDomain */
  public long getForceMaxBytesPerDomain() {
    return forceMaxBytesPerDomain;
  }

  /** @return the configurationSetsObjectLimit */
  public boolean isConfigurationSetsObjectLimit() {
    return configurationSetsObjectLimit;
  }

  /** @return the configurationSetsByteLimit */
  public boolean isConfigurationSetsByteLimit() {
    return configurationSetsByteLimit;
  }

  /** @return the minCountObjects */
  public long getMinCountObjects() {
    return minCountObjects;
  }

  /** @return the maxCountObjects */
  public long getMaxCountObjects() {
    return maxCountObjects;
  }

  /** @return the totalCountObjects */
  public long getTotalCountObjects() {
    return totalCountObjects;
  }

  void setDefaultHarvestNamePrefix() {
    if (getJobID() != null) {
      ArchiveFileNaming naming = ArchiveFileNamingFactory.getInstance();
      log.debug("Applying the default ArchiveFileNaming class '{}'.", naming.getClass().getName());
      final String prefix = naming.getPrefix(this);
      setHarvestFilenamePrefix(prefix);
      log.debug("The harvestPrefix of this job is: {}", prefix);
    } else {
      log.warn(
          "The harvestnamePrefix is not set now, as it depends on the JobID, which is not set yet");
    }
  }

  /** @return the harvest-audience. */
  public String getHarvestAudience() {
    return harvestAudience;
  }

  /**
   * Set the harvest audience for this job. Taken from the harvestdefinition that generated this
   * job.
   *
   * @param theAudience the harvest-audience.
   */
  public void setHarvestAudience(String theAudience) {
    this.harvestAudience = theAudience;
  }

  // The following two methods are needed by harvestStatus-jobdetails.jsp.
  /**
   * Returns a list of sorted seeds for this job. The sorting is by domain, and inside each domain,
   * the list is sorted by URL.
   *
   * @return a list of sorted seeds for this job.
   */
  public List<String> getSortedSeedList() {
    // A TreeMap keeps the result sorted by domain, as the javadoc promises
    Map<String, Set<String>> urlMap = new TreeMap<String, Set<String>>();
    for (String seed : seedListSet) {
      String url;
      // Assume the protocol is http://, if it is missing
      if (!seed.matches(Constants.PROTOCOL_REGEXP)) {
        url = "http://" + seed;
      } else {
        url = seed;
      }
      String domain = getDomain(url);
      if (domain == null) {
        // stop processing this url, and continue to the next seed
        continue;
      }
      Set<String> set;
      if (urlMap.containsKey(domain)) {
        set = urlMap.get(domain);
      } else {
        set = new TreeSet<String>();
        urlMap.put(domain, set);
      }
      set.add(seed);
    }
    List<String> result = new ArrayList<String>();
    for (Set<String> set : urlMap.values()) {
      result.addAll(set);
    }
    return result;
  }
  /**
   * Get the domain, that the given URL belongs to.
   *
   * @param url a URL
   * @return the domain, that the given URL belongs to, or null if unable to do so.
   */
  private String getDomain(String url) {
    try {
      URL uri = new URL(url);
      return DomainUtils.domainNameFromHostname(uri.getHost());
    } catch (MalformedURLException e) {
      log.warn("The string '{}' is not a valid URL", url);
      return null;
    }
  }
}
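
The addConfiguration method above converts non-ASCII seed hosts to their IDN (punycode) form before adding them to the seed set. The original uses an IDNA utility class; the standalone sketch below mirrors the same steps with java.net.IDN from the JDK, so the class and variable names here are illustrative rather than the actual NetarchiveSuite code.

import java.net.IDN;
import java.net.URL;
import java.util.regex.Pattern;

public class IdnSeedSketch {
  public static void main(String[] args) throws Exception {
    String seedUrl = "http://bücher.example/katalog";
    String seedASCII = seedUrl;
    if (seedUrl.contains(":") || seedUrl.contains("/")) {
      String normalizedUrl = seedUrl;
      if (!normalizedUrl.matches("^[a-zA-Z]+:.*")) {
        // If no protocol is given, assume http (as in addConfiguration above)
        normalizedUrl = "http://" + normalizedUrl;
      }
      String domainName = new URL(normalizedUrl).getHost();
      String domainNameASCII = IDN.toASCII(domainName);
      if (!domainName.equals(domainNameASCII)) {
        // Substitute the converted host back into the original seed
        seedASCII = seedUrl.replaceFirst(Pattern.quote(domainName), domainNameASCII);
      }
    } else {
      seedASCII = IDN.toASCII(seedUrl);
    }
    System.out.println(seedASCII); // http://xn--bcher-kva.example/katalog
  }
}
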
/**
 * A base class for {@link JobGenerator} implementations. It is recommended to extend this class to
 * implement a new job generator.
 *
 * <p>The base algorithm iterates over the domain configurations within the harvest definition and,
 * according to the configuration ({@link HarvesterSettings#JOBGEN_DOMAIN_CONFIG_SUBSET_SIZE}),
 * builds subsets of domain configurations from which one or more jobs will be generated.
 */
abstract class AbstractJobGenerator implements JobGenerator {

  /** Logger for this class. */
  private static Log log = LogFactory.getLog(AbstractJobGenerator.class);

  /** How many domain configurations to process in one go. */
  private final long DOMAIN_CONFIG_SUBSET_SIZE =
      Settings.getLong(HarvesterSettings.JOBGEN_DOMAIN_CONFIG_SUBSET_SIZE);

  /** Whether deduplication is enabled. */
  private final boolean DEDUPLICATION_ENABLED =
      Settings.getBoolean(HarvesterSettings.DEDUPLICATION_ENABLED);

  @Override
  public int generateJobs(HarvestDefinition harvest) {
    log.info("Generating jobs for harvestdefinition # " + harvest.getOid());
    int jobsMade = 0;
    final Iterator<DomainConfiguration> domainConfigurations = harvest.getDomainConfigurations();

    while (domainConfigurations.hasNext()) {
      List<DomainConfiguration> subset = new ArrayList<DomainConfiguration>();
      while (domainConfigurations.hasNext() && subset.size() < DOMAIN_CONFIG_SUBSET_SIZE) {
        subset.add(domainConfigurations.next());
      }

      Collections.sort(subset, getDomainConfigurationSubsetComparator(harvest));
      if (log.isTraceEnabled()) {
        log.trace(
            subset.size()
                + " domainconfigs now sorted and ready to processing "
                + "for harvest #"
                + harvest.getOid());
      }
      jobsMade += processDomainConfigurationSubset(harvest, subset.iterator());
    }
    harvest.setNumEvents(harvest.getNumEvents() + 1);

    if (!harvest.isSnapShot()) {
      PartialHarvest focused = (PartialHarvest) harvest;
      Schedule schedule = focused.getSchedule();
      int numEvents = harvest.getNumEvents();

      // Calculate next event
      Date now = new Date();
      Date nextEvent = schedule.getNextEvent(focused.getNextDate(), numEvents);

      // Refuse to schedule event in the past
      if (nextEvent != null && nextEvent.before(now)) {
        int eventsSkipped = 0;
        while (nextEvent != null && nextEvent.before(now)) {
          nextEvent = schedule.getNextEvent(nextEvent, numEvents);
          eventsSkipped++;
        }
        if (log.isWarnEnabled()) {
          log.warn(
              "Refusing to schedule harvest definition '"
                  + harvest.getName()
                  + "' in the past. Skipped "
                  + eventsSkipped
                  + " events. Old nextDate was "
                  + focused.getNextDate()
                  + " new nextDate is "
                  + nextEvent);
        }
      }

      // Set next event
      focused.setNextDate(nextEvent);
      if (log.isTraceEnabled()) {
        log.trace(
            "Next event for harvest definition "
                + harvest.getName()
                + " happens: "
                + (nextEvent == null ? "Never" : nextEvent.toString()));
      }
    }

    log.info(
        "Finished generating " + jobsMade + " jobs for harvestdefinition # " + harvest.getOid());
    return jobsMade;
  }

  /**
   * Instantiates a new job.
   *
   * @param cfg the {@link DomainConfiguration} being processed
   * @param harvest the {@link HarvestDefinition} being processed
   * @return an instance of {@link Job}
   */
  public static Job getNewJob(HarvestDefinition harvest, DomainConfiguration cfg) {
    HarvestChannelDAO harvestChannelDao = HarvestChannelDAO.getInstance();
    HarvestChannel channel = harvestChannelDao.getChannelForHarvestDefinition(harvest.getOid());
    if (channel == null) {
      log.info(
          "No channel mapping registered for harvest id "
              + harvest.getOid()
              + ", will use default.");
      channel = harvestChannelDao.getDefaultChannel(harvest.isSnapShot());
    }
    if (harvest.isSnapShot()) {
      return Job.createSnapShotJob(
          harvest.getOid(),
          channel,
          cfg,
          harvest.getMaxCountObjects(),
          harvest.getMaxBytes(),
          ((FullHarvest) harvest).getMaxJobRunningTime(),
          harvest.getNumEvents());
    }
    return Job.createJob(harvest.getOid(), channel, cfg, harvest.getNumEvents());
  }

  /**
   * Returns a comparator used to sort the subset of {@link #DOMAIN_CONFIG_SUBSET_SIZE}
   * configurations that are scanned at each iteration.
   *
   * @param harvest the {@link HarvestDefinition} being processed.
   * @return a comparator
   */
  protected abstract Comparator<DomainConfiguration> getDomainConfigurationSubsetComparator(
      HarvestDefinition harvest);

  /**
   * Create new jobs from a collection of configurations. All configurations must use the same
   * order.xml file.
   *
   * @param harvest the {@link HarvestDefinition} being processed.
   * @param domainConfSubset the configurations to use to create the jobs
   * @return The number of jobs created
   * @throws ArgumentNotValid if any of the parameters is null or if the cfglist does not contain
   *     any configurations
   */
  protected abstract int processDomainConfigurationSubset(
      HarvestDefinition harvest, Iterator<DomainConfiguration> domainConfSubset);

  @Override
  public boolean canAccept(Job job, DomainConfiguration cfg) {
    if (!checkAddDomainConfInvariant(job, cfg)) {
      return false;
    }
    return checkSpecificAcceptConditions(job, cfg);
  }

  /**
   * Called by {@link #canAccept(Job, DomainConfiguration)}. Tests the implementation-specific
   * conditions to accept the given {@link DomainConfiguration} in the given {@link Job}. It is
   * assumed that {@link #checkAddDomainConfInvariant(Job, DomainConfiguration)} has already passed.
   *
   * @param job the {@link Job} being built
   * @param cfg the {@link DomainConfiguration} to test
   * @return true if the configuration passes the conditions.
   */
  protected abstract boolean checkSpecificAcceptConditions(Job job, DomainConfiguration cfg);

  /**
   * Once the job has been filled with {@link DomainConfiguration}s, performs the following
   * operation:
   *
   * <ol>
   *   <li>Edit the harvest template to add/remove deduplicator configuration.
   * </ol>
   *
   * @param job the job
   */
  protected void editJobOrderXml(Job job) {
    Document doc = job.getOrderXMLdoc();
    if (DEDUPLICATION_ENABLED) {
      // Check that the Deduplicator element is present in the
      // OrderXMl and enabled. If missing or disabled log a warning
      if (!HeritrixTemplate.isDeduplicationEnabledInTemplate(doc)) {
        if (log.isWarnEnabled()) {
          log.warn(
              "Unable to perform deduplication for this job"
                  + " as the required DeDuplicator element is "
                  + "disabled or missing from template");
        }
      }
    } else {
      // Remove deduplicator Element from OrderXML if present
      Node xpathNode = doc.selectSingleNode(HeritrixTemplate.DEDUPLICATOR_XPATH);
      if (xpathNode != null) {
        xpathNode.detach();
        job.setOrderXMLDoc(doc);
        if (log.isInfoEnabled()) {
          log.info("Removed DeDuplicator element because " + "Deduplication is disabled");
        }
      }
    }
  }

  /**
   * Tests that:
   *
   * <ol>
   *   <li>The given domain configuration and job are not null.
   *   <li>The job does not already contain the given domain configuration.
   *   <li>The domain configuration has the same order xml name as the first inserted domain config.
   * </ol>
   *
   * @param job a given Job
   * @param cfg a given DomainConfiguration
   * @return true, if the given DomainConfiguration can be inserted into the given job
   */
  private boolean checkAddDomainConfInvariant(Job job, DomainConfiguration cfg) {
    ArgumentNotValid.checkNotNull(job, "job");
    ArgumentNotValid.checkNotNull(cfg, "cfg");

    // check if domain in DomainConfiguration cfg is not already in this job
    // domainName is used as key in domainConfigurationMap
    if (job.getDomainConfigurationMap().containsKey(cfg.getDomainName())) {
      if (log.isDebugEnabled()) {
        log.debug("Job already has a configuration for Domain '" + cfg.getDomainName() + "'.");
      }
      return false;
    }

    // check if template is same as this job.
    String orderXMLname = job.getOrderXMLName();
    if (!orderXMLname.equals(cfg.getOrderXmlName())) {
      if (log.isDebugEnabled()) {
        log.debug(
            "This Job only accept configurations "
                + "using the harvest template '"
                + orderXMLname
                + "'. This configuration uses the harvest template '"
                + cfg.getOrderXmlName()
                + "'.");
      }
      return false;
    }

    return true;
  }
}
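
generateJobs above drains the domain-configuration iterator in chunks of at most DOMAIN_CONFIG_SUBSET_SIZE, sorts each chunk with the subclass-provided comparator, and hands it to processDomainConfigurationSubset. The standalone sketch below shows that chunked-iteration pattern with plain strings; the chunk size and the natural-order comparator are illustrative stand-ins, not the actual settings or comparator.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;

public class ChunkedIterationSketch {
  // Stands in for HarvesterSettings.JOBGEN_DOMAIN_CONFIG_SUBSET_SIZE
  static final int SUBSET_SIZE = 3;

  public static void main(String[] args) {
    Iterator<String> configs =
        Arrays.asList("e.org", "a.org", "d.org", "c.org", "b.org").iterator();
    while (configs.hasNext()) {
      // Drain up to SUBSET_SIZE configurations into one subset
      List<String> subset = new ArrayList<>();
      while (configs.hasNext() && subset.size() < SUBSET_SIZE) {
        subset.add(configs.next());
      }
      // The real code sorts with getDomainConfigurationSubsetComparator(harvest)
      subset.sort(Comparator.naturalOrder());
      System.out.println("Processing subset: " + subset);
      // processDomainConfigurationSubset(harvest, subset.iterator()) would run here
    }
  }
}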