/**
 * This class represents one job to run by Heritrix. It is based on a number of configurations, all
 * based on the same order.xml, with at most one configuration per domain. Each job consists of
 * configurations of approximately the same size; that is, the expected size of the largest
 * configuration is within a factor limMaxRelSize of the smallest (differences smaller than
 * limMinAbsSize are ignored). There is also a limit, limMaxTotalSize, on the total size of the job
 * in objects.
 *
 * <p>A job may also be limited on bytes or objects, defined either by the configurations in the job
 * or by the harvest definition the job is generated by.
 *
 * <p>The job contains the order file, the seedlist and the current status of the job, as well as
 * the ID of the harvest definition that defined it and the names of all the configurations it is
 * based on.
 */
@SuppressWarnings({"serial"})
public class Job implements Serializable, JobInfo {

    private static final transient Logger log = LoggerFactory.getLogger(Job.class);

    // Persistent fields stored in and read from DAO

    /** The persistent ID of this job. */
    private Long jobID;

    /** The ID of the harvest definition that generated this job. */
    protected Long origHarvestDefinitionID;

    /** The status of the job. See the JobStatus class for the possible states. */
    protected JobStatus status;

    /** The name of the {@link HarvestChannel} on which this job will be posted. */
    private String channel;

    /** True if the job belongs to a snapshot harvest, false if it belongs to a partial harvest. */
    private boolean isSnapshot;

    /**
     * Overrides the individual configurations' maximum setting for objects retrieved from a domain
     * when set to a positive value.
     */
    private long forceMaxObjectsPerDomain = Constants.HERITRIX_MAXOBJECTS_INFINITY;

    /**
     * Overrides the individual configurations' maximum setting for bytes retrieved from a domain
     * when set to other than -1.
     */
    private long forceMaxBytesPerDomain = Constants.HERITRIX_MAXBYTES_INFINITY;

    /** The name of the harvest template used by the job. */
    private String orderXMLname;

    /** The harvest template used by the job. */
    private HeritrixTemplate orderXMLdoc;

    /** The list of Heritrix settings files. */
    private File[] settingsXMLfiles;

    /** The corresponding Dom4j Documents for these files. */
    // private Document[] settingsXMLdocs;

    /**
     * A set of seeds involved in this job. Outside the setSeedList() method, the set of seeds is
     * updated in the addConfiguration() method.
     */
    private Set<String> seedListSet = new HashSet<String>();

    /** Which run of the harvest definition this is. */
    private int harvestNum;

    /** Errors during harvesting. */
    private String harvestErrors;

    /** Details about errors during harvesting. */
    private String harvestErrorDetails;

    /** Errors during upload of the harvested data. */
    private String uploadErrors;

    /** Details about errors during upload of the harvested data. */
    private String uploadErrorDetails;

    /** The starting point of the job. */
    private Date actualStart;

    /** The ending point of the job. */
    private Date actualStop;

    /** The time when this job was submitted. */
    private Date submittedDate;

    /** The time when this job was created. */
    private Date creationDate;

    /** Edition is used by the DAO to keep track of changes. */
    private long edition = -1;

    /** Resubmitted as the Job with this ID. If null, this job has not been resubmitted. */
    private Long resubmittedAsJobWithID;

    /** Continuation of this job. */
    private Long continuationOF;

    /**
     * A map (domainName, domainConfigurationName), must be accessible in order to update job
     * information (see Ass. 2.4.3).
     */
    private Map<String, String> domainConfigurationMap;

    /**
     * A hint to the DAO that configurations have changed. Since configurations are large, the DAO
     * can use the fact that this is false to avoid updating the config list. The DAO can set it to
     * false after saving configurations.
     */
    boolean configsChanged = false;

    // Intermediate fields, non-persistent and only used while building objects

    /**
     * Whether the maxObjects field was defined by the harvest definition or the configuration
     * limit. This decides whether we accept smaller configurations when building jobs. True means
     * the limit is defined by the configuration, false means that it is defined by the harvest
     * definition.
     */
    private boolean configurationSetsObjectLimit;

    /**
     * Whether the maxBytes field was defined by the harvest definition or the configuration limit.
     * This decides whether we accept smaller configurations when building jobs. True means the
     * limit is defined by the configuration, false means by the harvest definition.
     */
    private boolean configurationSetsByteLimit;

    /** The lowest number of objects expected by a configuration. */
    private long minCountObjects;

    /** The highest number of objects expected by a configuration. */
    private long maxCountObjects;

    /** The total number of objects expected by all added configurations. */
    private long totalCountObjects;

    /** The max time in seconds given to the harvester for this job. 0 is unlimited. */
    private long forceMaxRunningTime;

    /**
     * If true, this job object is still undergoing changes due to having more configurations
     * added. Once set to false, the object is considered immutable except for updating the status.
     *
     * <p>Jobs loaded from the DAO are never under construction anymore.
     */
    private boolean underConstruction = true;

    // Constants
    // Note: The following constants are intentionally left non-static for easy
    // unit testing

    private boolean maxObjectsIsSetByQuotaEnforcer =
            Settings.getBoolean(HarvesterSettings.OBJECT_LIMIT_SET_BY_QUOTA_ENFORCER);

    /**
     * The harvestname prefix used in the files generated by Heritrix. It is set using an
     * ArchiveFileNaming class when the jobID is available.
     */
    private String harvestnamePrefix;

    /** This variable is right now the same as the harvestdefinitions.audience field. */
    private String harvestAudience;

    protected Job() {
        this.status = JobStatus.NEW;
    }

    /**
     * Package private constructor for common initialisation.
     *
     * @param harvestID the ID of the harvest definition
     * @param cfg the configuration to base the Job on
     * @param orderXMLdoc the harvest template (order.xml) to use for the job
     * @param channel the channel on which the job will be submitted.
     * @param forceMaxObjectsPerDomain the maximum number of objects harvested from a domain,
     *     overrides individual configuration settings. -1 means no limit
     * @param forceMaxBytesPerDomain the maximum number of bytes harvested from a domain, or -1 for
     *     no limit.
     * @param forceMaxJobRunningTime The max time in seconds given to the harvester for this job
     * @param harvestNum the run number of the harvest definition
     * @throws ArgumentNotValid if cfg, harvestID or channel is null, if harvestID is negative, or
     *     if any limit is less than -1
     */
    public Job(
            Long harvestID,
            DomainConfiguration cfg,
            HeritrixTemplate orderXMLdoc,
            HarvestChannel channel,
            long forceMaxObjectsPerDomain,
            long forceMaxBytesPerDomain,
            long forceMaxJobRunningTime,
            int harvestNum)
            throws ArgumentNotValid {
        ArgumentNotValid.checkNotNull(cfg, "cfg");
        ArgumentNotValid.checkNotNull(harvestID, "harvestID");
        ArgumentNotValid.checkNotNegative(harvestID, "harvestID");
        ArgumentNotValid.checkNotNull(channel, "channel");

        if (forceMaxObjectsPerDomain < -1) {
            String msg = "forceMaxObjectsPerDomain must be either -1 or positive";
            log.debug(msg);
            throw new ArgumentNotValid(msg);
        }
        if (forceMaxBytesPerDomain < -1) {
            String msg = "forceMaxBytesPerDomain must be either -1 or positive";
            log.debug(msg);
            throw new ArgumentNotValid(msg);
        }
        if (forceMaxBytesPerDomain == 0L) {
            log.warn(
                    "forceMaxBytesPerDomain should probably not be 0. Means 0 bytes downloaded per domain");
        }
        if (forceMaxObjectsPerDomain == 0L) {
            log.warn(
                    "forceMaxObjectsPerDomain should probably not be 0. Means 0 objects downloaded per domain");
        }

        // set up initial members
        domainConfigurationMap = new HashMap<>();
        origHarvestDefinitionID = harvestID;
        orderXMLname = cfg.getOrderXmlName();
        this.orderXMLdoc = orderXMLdoc;

        setHarvestChannel(channel);

        long maxObjects = NumberUtils.minInf(forceMaxObjectsPerDomain, cfg.getMaxObjects());
        setMaxObjectsPerDomain(maxObjects);
        configurationSetsObjectLimit = (maxObjects != forceMaxObjectsPerDomain);

        long maxBytes = NumberUtils.minInf(forceMaxBytesPerDomain, cfg.getMaxBytes());
        setMaxBytesPerDomain(maxBytes);
        configurationSetsByteLimit = (maxBytes != forceMaxBytesPerDomain);

        long expectation =
                cfg.getExpectedNumberOfObjects(forceMaxObjectsPerDomain, forceMaxBytesPerDomain);
        maxCountObjects = expectation;
        minCountObjects = expectation;
        this.harvestNum = harvestNum;

        addConfiguration(cfg);

        setMaxJobRunningTime(forceMaxJobRunningTime);

        setArchiveFormatInTemplate(Settings.get(HarvesterSettings.HERITRIX_ARCHIVE_FORMAT));
        setAttributes(cfg.getAttributesAndTypes());

        orderXMLdoc.enableOrDisableDeduplication(
                Settings.getBoolean(HarvesterSettings.DEDUPLICATION_ENABLED));

        status = JobStatus.NEW;
    }

    public void setAttributes(List<AttributeAndType> attributesAndTypes) {
        orderXMLdoc.insertAttributes(attributesAndTypes);
    }

    /** Update the order template according to the chosen archive format (arc/warc). */
    private void setArchiveFormatInTemplate(String archiveFormat) {
        if (!underConstruction) {
            final String msg = "Cannot modify job " + this + " as it is no longer under construction";
            log.debug(msg);
            throw new IllegalState(msg);
        }
        orderXMLdoc.setArchiveFormat(archiveFormat);
    }

    /**
     * Create a new Job object from basic information stored in the DAO.
     *
     * @param harvestID the ID of the harvest definition
     * @param configurations the configurations to base the Job on
     * @param channel the name of the channel on which the job will be submitted.
     * @param snapshot whether the job belongs to a snapshot harvest
     * @param forceMaxObjectsPerDomain the maximum number of objects harvested from a domain,
     *     overrides individual configuration settings. -1 means no limit.
     * @param forceMaxBytesPerDomain the maximum number of bytes harvested from a domain, or -1 for
     *     no limit.
     * @param forceMaxJobRunningTime The max time in seconds given to the harvester for this job
     * @param status the current status of the job.
     * @param orderXMLname the name of the order template used.
     * @param orderXMLdoc the (possibly modified) template
     * @param seedlist the combined seedlist from all configs.
     * @param harvestNum the run number of the harvest definition
     * @param continuationOf the ID of the job this job continues, or null if it starts from scratch
     */
    Job(
            Long harvestID,
            Map<String, String> configurations,
            String channel,
            boolean snapshot,
            long forceMaxObjectsPerDomain,
            long forceMaxBytesPerDomain,
            long forceMaxJobRunningTime,
            JobStatus status,
            String orderXMLname,
            HeritrixTemplate orderXMLdoc,
            String seedlist,
            int harvestNum,
            Long continuationOf) {
        origHarvestDefinitionID = harvestID;
        domainConfigurationMap = configurations;
        this.channel = channel;
        this.isSnapshot = snapshot;
        this.forceMaxBytesPerDomain = forceMaxBytesPerDomain;
        this.forceMaxObjectsPerDomain = forceMaxObjectsPerDomain;
        this.forceMaxRunningTime = forceMaxJobRunningTime;
        this.status = status;
        this.orderXMLname = orderXMLname;
        this.orderXMLdoc = orderXMLdoc;
        this.setSeedList(seedlist);
        this.harvestNum = harvestNum;
        this.continuationOF = continuationOf;

        underConstruction = false;
    }

    /**
     * Adds a configuration to this Job. Seedlists and settings are updated accordingly.
     *
     * @param cfg the configuration to add
     * @throws ArgumentNotValid if cfg is null, if cfg uses a different orderxml than this job, or
     *     if this job already contains a configuration associated with the domain of cfg.
     */
    public void addConfiguration(DomainConfiguration cfg) {
        ArgumentNotValid.checkNotNull(cfg, "cfg");
        if (domainConfigurationMap.containsKey(cfg.getDomainName())) {
            throw new ArgumentNotValid(
                    "Job already has a configuration for Domain " + cfg.getDomainName());
        }

        if (log.isTraceEnabled()) {
            log.trace("Adding configuration '{}' to job '{}'", cfg.getName(), this);
        }

        if (!underConstruction) {
            final String msg = "Cannot modify job " + this + " as it is no longer under construction";
            log.debug(msg);
            throw new IllegalState(msg);
        }
        if (!cfg.getOrderXmlName().equals(getOrderXMLName())) {
            throw new ArgumentNotValid(
                    "Job requires the orderxml file:'"
                            + getOrderXMLName()
                            + "' not:'"
                            + cfg.getOrderXmlName()
                            + "' used by the configuration:'"
                            + cfg.getName()
                            + "'");
        }

        domainConfigurationMap.put(cfg.getDomainName(), cfg.getName());

        // Add the seeds from the configuration to the Job seeds.
        // Take care of duplicates.
        for (Iterator<SeedList> itt = cfg.getSeedLists(); itt.hasNext(); ) {
            SeedList seed = itt.next();
            List<String> seeds = seed.getSeeds();
            for (String seedUrl : seeds) {
                seedListSet.add(seedUrl); // duplicates are silently ignored

                // TODO remove when heritrix implements this functionality
                // try to convert a seed into an Internationalized Domain Name
                try {
                    String seedASCII = seedUrl;
                    // It is rare to see these seeds, but they need to be
                    // correctly idnaized
                    if (seedUrl.contains(":") || seedUrl.contains("/")) {
                        String normalizedUrl = seedUrl;
                        if (!normalizedUrl.matches("^[a-zA-Z]+:.*")) {
                            // If no protocol is given, assume http
                            normalizedUrl = "http://" + normalizedUrl;
                        }
                        URL url = new URL(normalizedUrl);
                        String domainName = url.getHost();
                        String domainNameASCII = IDNA.toASCII(domainName);
                        if (!domainName.equals(domainNameASCII)) {
                            // If the domain name changed, replace that in the seed.
                            seedASCII = seedUrl.replaceFirst(Pattern.quote(domainName), domainNameASCII);
                        }
                    } else {
                        seedASCII = IDNA.toASCII(seedUrl);
                    }
                    if (!seedASCII.equals(seedUrl)) {
                        log.trace("Converted {} to {}", seedUrl, seedASCII);
                        // Note that duplicates are silently ignored
                        seedListSet.add(seedASCII);
                    }
                } catch (IDNAException e) {
                    log.trace("Cannot convert seed {} to ASCII", seedUrl, e);
                } catch (MalformedURLException e) {
                    log.trace("Cannot convert seed {} to ASCII", seedUrl, e);
                }
            }
        }

        orderXMLdoc.editOrderXMLAddPerDomainCrawlerTraps(cfg);

        // TODO update limits in settings files - see also bug 269

        // Update estimates of job size
        long expectation =
                cfg.getExpectedNumberOfObjects(forceMaxObjectsPerDomain, forceMaxBytesPerDomain);
        maxCountObjects = Math.max(expectation, maxCountObjects);
        minCountObjects = Math.min(expectation, minCountObjects);
        totalCountObjects += expectation;

        configsChanged = true;

        assert (maxCountObjects >= minCountObjects) : "basic invariant";
    }

    /**
     * Get the name of the order XML file used by this Job.
     *
     * @return the name of the orderXML file
     */
    public String getOrderXMLName() {
        return orderXMLname;
    }

    /**
     * Get the actual time when this job was stopped/completed.
     *
     * @return the time as Date
     */
    public Date getActualStop() {
        return actualStop;
    }

    /**
     * Get the actual time when this job was started.
     *
     * @return the time as Date
     */
    public Date getActualStart() {
        return actualStart;
    }

    /**
     * Get the time when this job was submitted.
     *
     * @return the time as Date
     */
    public Date getSubmittedDate() {
        return submittedDate;
    }

    /**
     * Get the time when this job was created.
     *
     * @return the creation time as a <code>Date</code>
     */
    public Date getCreationDate() {
        return creationDate;
    }

    /**
     * Get a list of Heritrix settings.xml files. Note that these files have nothing to do with
     * NetarchiveSuite settings files. They are files that supplement the Heritrix order.xml files,
     * and contain overrides for specific domains.
     *
     * @return the list of Files as an array
     */
    public File[] getSettingsXMLfiles() {
        return settingsXMLfiles;
    }

    /**
     * Get the id of the HarvestDefinition from which this job originates.
     *
     * @return the id as a Long
     */
    public Long getOrigHarvestDefinitionID() {
        return origHarvestDefinitionID;
    }

    /**
     * Get the id of this Job.
     *
     * @return the id as a Long
     */
    public Long getJobID() {
        return jobID;
    }

    /**
     * Set the id of this Job.
     *
     * @param id The Id for this job.
     */
    public void setJobID(Long id) {
        jobID = id;
    }

    /**
     * Gets the total number of different domains harvested by this job.
     *
     * @return the number of configurations added to this job
     */
    public int getCountDomains() {
        return domainConfigurationMap.size();
    }

    /**
     * Set the actual time when this job was started.
     *
     * <p>Logs a warning if actualStart is set to a time after actualStop.
     *
     * @param actualStart A Date object representing the time when this job was started.
     */
    public void setActualStart(Date actualStart) {
        ArgumentNotValid.checkNotNull(actualStart, "actualStart");
        if (actualStop != null && actualStop.before(actualStart)) {
            log.warn(
                    "Job("
                            + getJobID()
                            + "): Start time ("
                            + actualStart
                            + ") is after end time: "
                            + actualStop);
        }
        this.actualStart = (Date) actualStart.clone();
    }

    /**
     * Set the actual time when this job was stopped/completed. Logs a warning if actualStop is set
     * to a time before actualStart.
     *
     * @param actualStop A Date object representing the time when this job was stopped.
     * @throws ArgumentNotValid if actualStop is null
     */
    public void setActualStop(Date actualStop) throws ArgumentNotValid {
        ArgumentNotValid.checkNotNull(actualStop, "actualStop");
        if (actualStart == null) {
            log.warn("Job(" + getJobID() + "): actualStart should be defined before setting actualStop");
        } else if (actualStop.before(actualStart)) {
            log.warn(
                    "Job("
                            + getJobID()
                            + "): actualStop ("
                            + actualStop
                            + ") is before actualStart: "
                            + actualStart);
        }
        this.actualStop = (Date) actualStop.clone();
    }

    /**
     * Set the orderxml for this job.
     *
     * @param doc an orderxml to be used by this job
     */
    public void setOrderXMLDoc(HeritrixTemplate doc) {
        ArgumentNotValid.checkNotNull(doc, "doc");
        this.orderXMLdoc = doc;
    }

    /**
     * Gets the harvest template representation of the order.xml associated with this Job.
     *
     * @return the template as a HeritrixTemplate
     */
    public HeritrixTemplate getOrderXMLdoc() {
        return orderXMLdoc;
    }

    // /**
    //  * Gets a list of document representations of the settings.xml's associated with this Job.
    //  *
    //  * @return the XML as an array of org.dom4j.Document
    //  */
    // public Document[] getSettingsXMLdocs() {
    //     return settingsXMLdocs;
    // }

    /**
     * Set the seedlist of the job from the seedList argument. Individual seeds are separated by a
     * '\n' character. Duplicate seeds are removed.
     *
     * @param seedList List of seeds as one String
     */
    public void setSeedList(String seedList) {
        ArgumentNotValid.checkNotNullOrEmpty(seedList, "seedList");
        seedListSet = new HashSet<>();
        BufferedReader reader = new BufferedReader(new StringReader(seedList));
        String seed;
        try {
            while ((seed = reader.readLine()) != null) {
                seedListSet.add(seed); // add to seedlist if not already there
            }
        } catch (IOException e) {
            // This never happens, as we're reading from a string!
            throw new IOFailure("IOException reading from seed string", e);
        } finally {
            IOUtils.closeQuietly(reader);
        }
    }

    /**
     * Get the seedlist as a String. The individual seeds are separated by the character '\n'. The
     * order of the seeds is unknown.
     *
     * @return the seedlist as a String
     */
    public String getSeedListAsString() {
        return StringUtils.conjoin("\n", seedListSet);
    }

    /**
     * Get the current status of this Job.
     *
     * @return the status as a JobStatus
     */
    public JobStatus getStatus() {
        return status;
    }

    /**
     * Sets the status of this job.
     *
     * @param newStatus the new status; must be one of the {@link JobStatus} values
     * @throws ArgumentNotValid in case of an invalid status argument or an invalid status change
     */
    public void setStatus(JobStatus newStatus) {
        ArgumentNotValid.checkNotNull(newStatus, "newStatus");
        if (!status.legalChange(newStatus)) {
            final String message = "Status change from " + status + " to " + newStatus + " is not allowed";
            log.debug(message);
            throw new ArgumentNotValid(message);
        }

        if ((this.status == JobStatus.NEW || this.status == JobStatus.RESUBMITTED)
                && newStatus == JobStatus.SUBMITTED) {
            orderXMLdoc.configureQuotaEnforcer(
                    maxObjectsIsSetByQuotaEnforcer, forceMaxBytesPerDomain, forceMaxObjectsPerDomain);
        }

        if (this.status == JobStatus.SUBMITTED && newStatus == JobStatus.STARTED) {
            setActualStart(new Date());
        }
        if (this.status == JobStatus.STARTED
                && (newStatus == JobStatus.DONE || newStatus == JobStatus.FAILED)) {
            setActualStop(new Date());
        }
        status = newStatus;
    }

    /**
     * Returns a map of domain names and the names of their corresponding configurations.
     *
     * <p>The returned Map cannot be changed.
     *
     * @return a read-only Map (<String>, <String>)
     */
    public Map<String, String> getDomainConfigurationMap() {
        return Collections.unmodifiableMap(domainConfigurationMap);
    }

    /**
     * Gets the maximum number of objects harvested per domain.
     *
     * @return The maximum number of objects harvested per domain. -1 means no limit.
     */
    public long getMaxObjectsPerDomain() {
        return forceMaxObjectsPerDomain;
    }

    /**
     * Gets the maximum number of bytes harvested per domain.
     *
     * @return The maximum number of bytes harvested per domain. -1 means no limit.
     */
    public long getMaxBytesPerDomain() {
        return forceMaxBytesPerDomain;
    }

    /**
     * Get the edition number.
     *
     * @return The edition number
     */
    long getEdition() {
        return edition;
    }

    /**
     * Set the edition number.
     *
     * @param edition the new edition number
     */
    void setEdition(long edition) {
        this.edition = edition;
    }

    /**
     * Sets the associated {@link HarvestChannel} name and snapshot flag from the given channel.
     *
     * @param harvestChannel the channel on which this job will be posted
     */
    public void setHarvestChannel(HarvestChannel harvestChannel) {
        this.channel = harvestChannel.getName();
        this.isSnapshot = harvestChannel.isSnapshot();
    }

    /** @return the associated {@link HarvestChannel} name. */
    public String getChannel() {
        return channel;
    }

    /**
     * Sets the associated {@link HarvestChannel} name.
     *
     * @param channel the channel name
     */
    public void setChannel(String channel) {
        this.channel = channel;
    }

    /**
     * @return true if the job belongs to a snapshot harvest, false if it belongs to a focused
     *     harvest.
     */
    public boolean isSnapshot() {
        return isSnapshot;
    }

    /**
     * Sets whether the job belongs to a snapshot or focused harvest.
     *
     * @param isSnapshot true if the job belongs to a snapshot harvest, false if it belongs to a
     *     focused harvest.
     */
    public void setSnapshot(boolean isSnapshot) {
        this.isSnapshot = isSnapshot;
    }

    @Override
    public String toString() {
        return "Job "
                + getJobID()
                + " (state = "
                + getStatus()
                + ", HD = "
                + getOrigHarvestDefinitionID()
                + ", channel = "
                + getChannel()
                + ", snapshot = "
                + isSnapshot()
                + ", forcemaxcount = "
                + getForceMaxObjectsPerDomain()
                + ", forcemaxbytes = "
                + getMaxBytesPerDomain()
                + ", forcemaxrunningtime = "
                + forceMaxRunningTime
                + ", orderxml = "
                + getOrderXMLName()
                + ", numconfigs = "
                + getDomainConfigurationMap().size()
                + ", created = "
                + getCreationDate()
                + (getSubmittedDate() != null ? ", submitted = " + getSubmittedDate() : "")
                + (getActualStart() != null ? ", started = " + getActualStart() : "")
                + (getActualStop() != null ? ", stopped = " + getActualStop() : "")
                + ")";
    }

    /** @return Returns the forceMaxObjectsPerDomain. -1 means no limit. */
    public long getForceMaxObjectsPerDomain() {
        return forceMaxObjectsPerDomain;
    }

    /**
     * Sets the maxObjectsPerDomain value.
     *
     * @param maxObjectsPerDomain The forceMaxObjectsPerDomain to set. -1 means no limit.
     * @throws IllegalState if this job is no longer under construction.
     */
    protected void setMaxObjectsPerDomain(long maxObjectsPerDomain) {
        if (!underConstruction) {
            final String msg = "Cannot modify job " + this + " as it is no longer under construction";
            log.debug(msg);
            throw new IllegalState(msg);
        }

        this.forceMaxObjectsPerDomain = maxObjectsPerDomain;
        // FIXME? add maxObjectsIsSetByQuotaEnforcer as argument to setMaxObjectsPerDomain
        orderXMLdoc.setMaxObjectsPerDomain(maxObjectsPerDomain);
        // orderXMLdoc.editOrderXML_maxObjectsPerDomain(orderXMLdoc, maxObjectsPerDomain,
        // maxObjectsIsSetByQuotaEnforcer);

        if (0L == maxObjectsPerDomain && 0L != forceMaxBytesPerDomain) {
            setMaxBytesPerDomain(0L);
        }
    }

    /**
     * Set the maxbytes per domain value.
     *
     * @param maxBytesPerDomain The maxBytesPerDomain to set, or -1 for no limit.
     */
    protected void setMaxBytesPerDomain(long maxBytesPerDomain) {
        if (!underConstruction) {
            final String msg = "Cannot modify job " + this + " as it is no longer under construction";
            log.debug(msg);
            throw new IllegalState(msg);
        }
        this.forceMaxBytesPerDomain = maxBytesPerDomain;
        orderXMLdoc.setMaxBytesPerDomain(maxBytesPerDomain);

        if (0L == maxBytesPerDomain && 0L != forceMaxObjectsPerDomain) {
            setMaxObjectsPerDomain(0L);
        }
    }

    /**
     * Set the maxJobRunningTime value.
     *
     * @param maxJobRunningTime The maxJobRunningTime in seconds to set, or 0 for no limit.
     */
    protected void setMaxJobRunningTime(long maxJobRunningTime) {
        if (!underConstruction) {
            final String msg = "Cannot modify job " + this + " as it is no longer under construction";
            log.debug(msg);
            throw new IllegalState(msg);
        }
        this.forceMaxRunningTime = maxJobRunningTime;
        orderXMLdoc.setMaxJobRunningTime(maxJobRunningTime);
    }

    /** @return Returns the MaxJobRunningTime. 0 means no limit. */
    public long getMaxJobRunningTime() {
        return forceMaxRunningTime;
    }

    /**
     * Get the harvestNum for this job. The number reflects which run of the harvest definition
     * this is.
     *
     * @return the harvestNum for this job.
     */
    public int getHarvestNum() {
        return harvestNum;
    }

    /**
     * Set the harvestNum for this job. The number reflects which run of the harvest definition
     * this is. ONLY TO BE USED IN THE CONSTRUCTION PHASE.
     *
     * @param harvestNum a given harvestNum
     */
    public void setHarvestNum(int harvestNum) {
        if (!underConstruction) {
            final String msg = "Cannot modify job " + this + " as it is no longer under construction";
            log.debug(msg);
            throw new IllegalState(msg);
        }
        this.harvestNum = harvestNum;
    }

    /**
     * Get the list of harvest errors for this job. If there are no harvest errors, null is
     * returned. This value is not meaningful until the job is finished (FAILED, DONE, RESUBMITTED).
     *
     * @return the harvest errors for this job or null if no harvest errors.
     */
    public String getHarvestErrors() {
        return harvestErrors;
    }

    /**
     * Append to the list of harvest errors for this job. Nothing happens if the argument
     * harvestErrors is null.
     *
     * @param harvestErrors a string containing harvest errors (may be null)
     */
    public void appendHarvestErrors(String harvestErrors) {
        if (harvestErrors != null) {
            if (this.harvestErrors == null) {
                this.harvestErrors = harvestErrors;
            } else {
                this.harvestErrors += "\n" + harvestErrors;
            }
        }
    }

    /**
     * Get the list of harvest error details for this job. If there are no harvest error details,
     * null is returned. This value is not meaningful until the job is finished (FAILED, DONE,
     * RESUBMITTED).
     *
     * @return the list of harvest error details for this job or null if no harvest error details.
     */
    public String getHarvestErrorDetails() {
        return harvestErrorDetails;
    }

    /**
     * Append to the list of harvest error details for this job. Nothing happens if the argument
     * harvestErrorDetails is null.
     *
     * @param harvestErrorDetails a string containing harvest error details.
     */
    public void appendHarvestErrorDetails(String harvestErrorDetails) {
        if (harvestErrorDetails != null) {
            if (this.harvestErrorDetails == null) {
                this.harvestErrorDetails = harvestErrorDetails;
            } else {
                this.harvestErrorDetails += "\n" + harvestErrorDetails;
            }
        }
    }

    /**
     * Get the list of upload errors. If there are no upload errors, null is returned. This value
     * is not meaningful until the job is finished (FAILED, DONE, RESUBMITTED).
     *
     * @return the list of upload errors as String, or null if no upload errors.
     */
    public String getUploadErrors() {
        return uploadErrors;
    }

    /**
     * Append to the list of upload errors.
     * Nothing happens if the argument uploadErrors is null.
     *
     * @param uploadErrors a string containing upload errors.
     */
    public void appendUploadErrors(String uploadErrors) {
        if (uploadErrors != null) {
            if (this.uploadErrors == null) {
                this.uploadErrors = uploadErrors;
            } else {
                this.uploadErrors += "\n" + uploadErrors;
            }
        }
    }

    /**
     * Get the list of upload error details. If there are no upload error details, null is
     * returned. This value is not meaningful until the job is finished (FAILED, DONE, RESUBMITTED).
     *
     * @return the list of upload error details as String, or null if no upload error details
     */
    public String getUploadErrorDetails() {
        return uploadErrorDetails;
    }

    /**
     * Append to the list of upload error details. Nothing happens if the argument
     * uploadErrorDetails is null.
     *
     * @param uploadErrorDetails a string containing upload error details.
     */
    public void appendUploadErrorDetails(String uploadErrorDetails) {
        if (uploadErrorDetails != null) {
            if (this.uploadErrorDetails == null) {
                this.uploadErrorDetails = uploadErrorDetails;
            } else {
                this.uploadErrorDetails += "\n" + uploadErrorDetails;
            }
        }
    }

    /**
     * Get the ID for the job which this job was resubmitted as. If null, this job has not been
     * resubmitted.
     *
     * @return the ID of the job this job was resubmitted as, or null.
     */
    public Long getResubmittedAsJob() {
        return resubmittedAsJobWithID;
    }

    /**
     * Set the Date for when this job was submitted. If null, this job has not been submitted.
     *
     * @param submittedDate The date when this was submitted
     */
    public void setSubmittedDate(Date submittedDate) {
        this.submittedDate = submittedDate;
    }

    /**
     * Set the Date for when this job was created. If null, the creation date has not been set.
     *
     * @param creationDate The date when this was created
     */
    public void setCreationDate(Date creationDate) {
        this.creationDate = creationDate;
    }

    /**
     * Set the ID for the job which this job was resubmitted as.
     *
     * @param resubmittedAsJob An ID for a new job.
     */
    public void setResubmittedAsJob(Long resubmittedAsJob) {
        this.resubmittedAsJobWithID = resubmittedAsJob;
    }

    /**
     * @return the ID of the job that this job is supposed to continue, using the Heritrix
     *     recover-log, or null if it starts from scratch.
     */
    public Long getContinuationOf() {
        return this.continuationOF;
    }

    @Override
    public String getHarvestFilenamePrefix() {
        if (this.harvestnamePrefix == null) {
            log.warn(
                    "HarvestnamePrefix not yet set for job {}. Set it by using the naming scheme. "
                            + "This should only happen for old jobs being read",
                    this.jobID);
            setDefaultHarvestNamePrefix();
        }
        return this.harvestnamePrefix;
    }

    /** @param prefix the harvest filename prefix to use for this job. */
    public void setHarvestFilenamePrefix(String prefix) {
        this.harvestnamePrefix = prefix;
    }

    /** @return the forceMaxBytesPerDomain */
    public long getForceMaxBytesPerDomain() {
        return forceMaxBytesPerDomain;
    }

    /** @return the configurationSetsObjectLimit */
    public boolean isConfigurationSetsObjectLimit() {
        return configurationSetsObjectLimit;
    }

    /** @return the configurationSetsByteLimit */
    public boolean isConfigurationSetsByteLimit() {
        return configurationSetsByteLimit;
    }

    /** @return the minCountObjects */
    public long getMinCountObjects() {
        return minCountObjects;
    }

    /** @return the maxCountObjects */
    public long getMaxCountObjects() {
        return maxCountObjects;
    }

    /** @return the totalCountObjects */
    public long getTotalCountObjects() {
        return totalCountObjects;
    }

    void setDefaultHarvestNamePrefix() {
        if (getJobID() != null) {
            ArchiveFileNaming naming = ArchiveFileNamingFactory.getInstance();
            log.debug("Applying the default ArchiveFileNaming class '{}'.", naming.getClass().getName());
            final String prefix = naming.getPrefix(this);
            setHarvestFilenamePrefix(prefix);
            log.debug("The harvestPrefix of this job is: {}", prefix);
        } else {
            log.warn(
                    "The harvestnamePrefix is not set now, as it depends on the JobID, which is not set yet");
        }
    }

    /** @return the harvest-audience. */
    public String getHarvestAudience() {
        return harvestAudience;
    }

    /**
     * Set the harvest audience for this job. Taken from the harvestdefinition that generated this
     * job.
     *
     * @param theAudience the harvest-audience.
     */
    public void setHarvestAudience(String theAudience) {
        this.harvestAudience = theAudience;
    }

    ///////////// The following two methods are needed by harvestStatus-jobdetails.jsp ////////////

    /**
     * Returns a list of sorted seeds for this job. The sorting is by domain, and inside each
     * domain, the list is sorted by URL.
     *
     * @return a list of sorted seeds for this job.
     */
    public List<String> getSortedSeedList() {
        Map<String, Set<String>> urlMap = new HashMap<String, Set<String>>();
        for (String seed : seedListSet) {
            String url;
            // Assume the protocol is http://, if it is missing
            if (!seed.matches(Constants.PROTOCOL_REGEXP)) {
                url = "http://" + seed;
            } else {
                url = seed;
            }
            String domain = getDomain(url);
            if (domain == null) {
                // stop processing this url, and continue to the next seed
                continue;
            }
            Set<String> set;
            if (urlMap.containsKey(domain)) {
                set = urlMap.get(domain);
            } else {
                set = new TreeSet<String>();
                urlMap.put(domain, set);
            }
            set.add(seed);
        }
        List<String> result = new ArrayList<String>();
        for (Set<String> set : urlMap.values()) {
            result.addAll(set);
        }
        return result;
    }

    /**
     * Get the domain that the given URL belongs to.
     *
     * @param url a URL
     * @return the domain that the given URL belongs to, or null if it cannot be determined.
     */
    private String getDomain(String url) {
        try {
            URL uri = new URL(url);
            return DomainUtils.domainNameFromHostname(uri.getHost());
        } catch (MalformedURLException e) {
            log.warn("The string '{}' is not a valid URL", url);
            return null;
        }
    }
}
/**
 * A base class for {@link JobGenerator} implementations. It is recommended to extend this class to
 * implement a new job generator.
 *
 * <p>The base algorithm iterates over the domain configurations within the harvest definition and,
 * according to the configured subset size ({@link HarvesterSettings#JOBGEN_DOMAIN_CONFIG_SUBSET_SIZE}),
 * builds subsets of domain configurations from which one or more jobs will be generated.
 */
abstract class AbstractJobGenerator implements JobGenerator {

    /** Logger for this class. */
    private static Log log = LogFactory.getLog(AbstractJobGenerator.class);

    /** How many domain configurations to process in one go. */
    private final long DOMAIN_CONFIG_SUBSET_SIZE =
            Settings.getLong(HarvesterSettings.JOBGEN_DOMAIN_CONFIG_SUBSET_SIZE);

    /** Is deduplication enabled or disabled. */
    private final boolean DEDUPLICATION_ENABLED =
            Settings.getBoolean(HarvesterSettings.DEDUPLICATION_ENABLED);

    @Override
    public int generateJobs(HarvestDefinition harvest) {
        log.info("Generating jobs for harvestdefinition # " + harvest.getOid());
        int jobsMade = 0;
        final Iterator<DomainConfiguration> domainConfigurations = harvest.getDomainConfigurations();

        while (domainConfigurations.hasNext()) {
            List<DomainConfiguration> subset = new ArrayList<DomainConfiguration>();
            while (domainConfigurations.hasNext() && subset.size() < DOMAIN_CONFIG_SUBSET_SIZE) {
                subset.add(domainConfigurations.next());
            }

            Collections.sort(subset, getDomainConfigurationSubsetComparator(harvest));
            if (log.isTraceEnabled()) {
                log.trace(
                        subset.size()
                                + " domainconfigs now sorted and ready to be processed "
                                + "for harvest #"
                                + harvest.getOid());
            }

            jobsMade += processDomainConfigurationSubset(harvest, subset.iterator());
        }

        harvest.setNumEvents(harvest.getNumEvents() + 1);

        if (!harvest.isSnapShot()) {
            PartialHarvest focused = (PartialHarvest) harvest;
            Schedule schedule = focused.getSchedule();
            int numEvents = harvest.getNumEvents();

            // Calculate next event
            Date now = new Date();
            Date nextEvent = schedule.getNextEvent(focused.getNextDate(), numEvents);

            // Refuse to schedule event in the past
            if (nextEvent != null && nextEvent.before(now)) {
                int eventsSkipped = 0;
                while (nextEvent != null && nextEvent.before(now)) {
                    nextEvent = schedule.getNextEvent(nextEvent, numEvents);
                    eventsSkipped++;
                }
                if (log.isWarnEnabled()) {
                    log.warn(
                            "Refusing to schedule harvest definition '"
                                    + harvest.getName()
                                    + "' in the past. Skipped "
                                    + eventsSkipped
                                    + " events. Old nextDate was "
                                    + focused.getNextDate()
                                    + " new nextDate is "
                                    + nextEvent);
                }
            }

            // Set next event
            focused.setNextDate(nextEvent);
            if (log.isTraceEnabled()) {
                log.trace(
                        "Next event for harvest definition "
                                + harvest.getName()
                                + " happens: "
                                + (nextEvent == null ? "Never" : nextEvent.toString()));
            }
        }

        log.info(
                "Finished generating " + jobsMade + " jobs for harvestdefinition # " + harvest.getOid());
        return jobsMade;
    }

    /**
     * Instantiates a new job.
     *
     * @param cfg the {@link DomainConfiguration} being processed
     * @param harvest the {@link HarvestDefinition} being processed
     * @return an instance of {@link Job}
     */
    public static Job getNewJob(HarvestDefinition harvest, DomainConfiguration cfg) {
        HarvestChannelDAO harvestChannelDao = HarvestChannelDAO.getInstance();
        HarvestChannel channel = harvestChannelDao.getChannelForHarvestDefinition(harvest.getOid());
        if (channel == null) {
            log.info(
                    "No channel mapping registered for harvest id "
                            + harvest.getOid()
                            + ", will use default.");
            channel = harvestChannelDao.getDefaultChannel(harvest.isSnapShot());
        }
        if (harvest.isSnapShot()) {
            return Job.createSnapShotJob(
                    harvest.getOid(),
                    channel,
                    cfg,
                    harvest.getMaxCountObjects(),
                    harvest.getMaxBytes(),
                    ((FullHarvest) harvest).getMaxJobRunningTime(),
                    harvest.getNumEvents());
        }
        return Job.createJob(harvest.getOid(), channel, cfg, harvest.getNumEvents());
    }

    /**
     * Returns a comparator used to sort the subset of {@link #DOMAIN_CONFIG_SUBSET_SIZE}
     * configurations that are scanned at each iteration.
     *
     * @param harvest the {@link HarvestDefinition} being processed.
     * @return a comparator
     */
    protected abstract Comparator<DomainConfiguration> getDomainConfigurationSubsetComparator(
            HarvestDefinition harvest);

    /**
     * Create new jobs from a collection of configurations. All configurations must use the same
     * order.xml file.
     *
     * @param harvest the {@link HarvestDefinition} being processed.
     * @param domainConfSubset the configurations to use to create the jobs
     * @return The number of jobs created
     * @throws ArgumentNotValid if any of the parameters is null or if the cfglist does not contain
     *     any configurations
     */
    protected abstract int processDomainConfigurationSubset(
            HarvestDefinition harvest, Iterator<DomainConfiguration> domainConfSubset);

    @Override
    public boolean canAccept(Job job, DomainConfiguration cfg) {
        if (!checkAddDomainConfInvariant(job, cfg)) {
            return false;
        }
        return checkSpecificAcceptConditions(job, cfg);
    }

    /**
     * Called by {@link #canAccept(Job, DomainConfiguration)}. Tests the implementation-specific
     * conditions to accept the given {@link DomainConfiguration} in the given {@link Job}. It is
     * assumed that {@link #checkAddDomainConfInvariant(Job, DomainConfiguration)} has already
     * passed.
     *
     * @param job the {@link Job} being built
     * @param cfg the {@link DomainConfiguration} to test
     * @return true if the configuration passes the conditions.
     */
    protected abstract boolean checkSpecificAcceptConditions(Job job, DomainConfiguration cfg);

    /**
     * Once the job has been filled with {@link DomainConfiguration}s, performs the following
     * operations:
     *
     * <ol>
     * <li>Edit the harvest template to add/remove deduplicator configuration.
     * </ol>
     *
     * @param job the job
     */
    protected void editJobOrderXml(Job job) {
        Document doc = job.getOrderXMLdoc();
        if (DEDUPLICATION_ENABLED) {
            // Check that the Deduplicator element is present in the
            // OrderXml and enabled. If missing or disabled, log a warning
            if (!HeritrixTemplate.isDeduplicationEnabledInTemplate(doc)) {
                if (log.isWarnEnabled()) {
                    log.warn(
                            "Unable to perform deduplication for this job"
                                    + " as the required DeDuplicator element is "
                                    + "disabled or missing from template");
                }
            }
        } else {
            // Remove deduplicator Element from OrderXML if present
            Node xpathNode = doc.selectSingleNode(HeritrixTemplate.DEDUPLICATOR_XPATH);
            if (xpathNode != null) {
                xpathNode.detach();
                job.setOrderXMLDoc(doc);
                if (log.isInfoEnabled()) {
                    log.info("Removed DeDuplicator element because " + "Deduplication is disabled");
                }
            }
        }
    }

    /**
     * Tests that:
     *
     * <ol>
     * <li>The given domain configuration and job are not null.
     * <li>The job does not already contain the given domain configuration.
     * <li>The domain configuration has the same order xml name as the first inserted domain
     *     config.
     * </ol>
     *
     * @param job a given Job
     * @param cfg a given DomainConfiguration
     * @return true if the given DomainConfiguration can be inserted into the given job
     */
    private boolean checkAddDomainConfInvariant(Job job, DomainConfiguration cfg) {
        ArgumentNotValid.checkNotNull(job, "job");
        ArgumentNotValid.checkNotNull(cfg, "cfg");

        // check if the domain in DomainConfiguration cfg is not already in this job
        // domainName is used as key in domainConfigurationMap
        if (job.getDomainConfigurationMap().containsKey(cfg.getDomainName())) {
            if (log.isDebugEnabled()) {
                log.debug("Job already has a configuration for Domain '" + cfg.getDomainName() + "'.");
            }
            return false;
        }

        // check if the template is the same as this job's.
        String orderXMLname = job.getOrderXMLName();
        if (!orderXMLname.equals(cfg.getOrderXmlName())) {
            if (log.isDebugEnabled()) {
                log.debug(
                        "This Job only accepts configurations "
                                + "using the harvest template '"
                                + orderXMLname
                                + "'. This configuration uses the harvest template '"
                                + cfg.getOrderXmlName()
                                + "'.");
            }
            return false;
        }
        return true;
    }
}
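// Illustrative sketch (assumption, not part of the original source): the minimal shape of a
// concrete JobGenerator built on AbstractJobGenerator. The class name, the comparator choice and
// the one-job-per-rejection policy below are placeholders; real implementations apply size and
// byte/object limits in checkSpecificAcceptConditions and typically also persist new jobs through
// the JobDAO.
//
//     class SimpleJobGenerator extends AbstractJobGenerator {
//         @Override
//         protected Comparator<DomainConfiguration> getDomainConfigurationSubsetComparator(
//                 HarvestDefinition harvest) {
//             // placeholder ordering: sort configurations by name
//             return Comparator.comparing(DomainConfiguration::getName);
//         }
//
//         @Override
//         protected boolean checkSpecificAcceptConditions(Job job, DomainConfiguration cfg) {
//             return true; // accept everything that passes checkAddDomainConfInvariant
//         }
//
//         @Override
//         protected int processDomainConfigurationSubset(
//                 HarvestDefinition harvest, Iterator<DomainConfiguration> domainConfSubset) {
//             int jobsMade = 0;
//             Job current = null;
//             while (domainConfSubset.hasNext()) {
//                 DomainConfiguration cfg = domainConfSubset.next();
//                 if (current == null || !canAccept(current, cfg)) {
//                     current = getNewJob(harvest, cfg); // start a new job for this configuration
//                     jobsMade++;
//                 } else {
//                     current.addConfiguration(cfg);     // grow the current job
//                 }
//             }
//             return jobsMade;
//         }
//     }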