/**
   * Decides whether a domain configuration may be added to a job.
   *
   * <p>The configuration is accepted only when both of the following hold:
   *
   * <ol>
   *   <li>The job's domain configuration map has no entry for the configuration's domain name.
   *   <li>The configuration's order xml name equals the job's order xml name.
   * </ol>
   *
   * <p>Null arguments are handed to ArgumentNotValid.checkNotNull before any of the checks run. A
   * rejection is reported on debug level.
   *
   * @param job a given Job
   * @param cfg a given DomainConfiguration
   * @return true, if the given DomainConfiguration can be inserted into the given job
   */
  private boolean checkAddDomainConfInvariant(Job job, DomainConfiguration cfg) {
    ArgumentNotValid.checkNotNull(job, "job");
    ArgumentNotValid.checkNotNull(cfg, "cfg");

    // The domain name is the key of the job's domain configuration map, so a hit means the
    // domain is already part of this job.
    final boolean domainAlreadyInJob =
        job.getDomainConfigurationMap().containsKey(cfg.getDomainName());
    if (domainAlreadyInJob) {
      if (log.isDebugEnabled()) {
        log.debug("Job already has a configuration for Domain '" + cfg.getDomainName() + "'.");
      }
      return false;
    }

    // The configuration must use the same harvest template as the job.
    final String jobTemplate = job.getOrderXMLName();
    final boolean sameTemplate = jobTemplate.equals(cfg.getOrderXmlName());
    if (sameTemplate) {
      return true;
    }

    if (log.isDebugEnabled()) {
      log.debug(
          "This Job only accept configurations "
              + "using the harvest template '"
              + jobTemplate
              + "'. This configuration uses the harvest template '"
              + cfg.getOrderXmlName()
              + "'.");
    }
    return false;
  }
  /**
   * Associates a StoreMessage with a filename. A StoreMessage already associated with the filename
   * is replaced by the new one.
   *
   * @param filename The name of the file to have a StoreMessage assigned.
   * @param msg The StoreMessage to be assigned to the file.
   * @throws ArgumentNotValid If the StoreMessage is null or if the filename is either null or the
   *     empty string.
   */
  @Override
  public void setReplyInfo(String filename, StoreMessage msg) throws ArgumentNotValid {
    // The message is validated before the filename, so a null message is reported first.
    ArgumentNotValid.checkNotNull(msg, "StoreMessage msg");
    ArgumentNotValid.checkNotNullOrEmpty(filename, "String filename");

    // Register the message under the filename; an earlier mapping is overwritten.
    this.storeEntries.put(filename, msg);
  }
  /**
   * Post-processes batch output by delegating to the loaded batch job.
   *
   * <p>The job is (re)loaded through loadBatchJob() before the call is forwarded.
   *
   * @param input The stream handed on to the loaded job's postProcess.
   * @param output The stream handed on to the loaded job's postProcess.
   * @return Whatever the loaded job's postProcess returns.
   */
  @Override
  public boolean postProcess(InputStream input, OutputStream output) {
    ArgumentNotValid.checkNotNull(input, "InputStream input");
    ArgumentNotValid.checkNotNull(output, "OutputStream output");

    // Make sure the job instance is available, then hand the work over to it.
    loadBatchJob();
    final boolean outcome = loadedJob.postProcess(input, output);
    return outcome;
  }
  /**
   * Looks up the ReplicaStoreState of a file in a specific replica.
   *
   * @param filename The name of the file for the ReplicaStoreState.
   * @param replicaChannelName The name of the identification channel that uniquely identifies the
   *     replica.
   * @return The ReplicaStoreState of the given file in the given replica, as stored in the
   *     database.
   * @throws ArgumentNotValid If the filename or the replica channel name is null or the empty
   *     string.
   */
  @Override
  public ReplicaStoreState getState(String filename, String replicaChannelName)
      throws ArgumentNotValid {
    ArgumentNotValid.checkNotNullOrEmpty(filename, "String filename");
    ArgumentNotValid.checkNotNullOrEmpty(replicaChannelName, "String replicaChannelName");

    // Resolve the channel name to its replica and ask the database for the state of the file
    // in that replica.
    return database.getReplicaStoreState(
        filename, Channels.retrieveReplicaFromIdentifierChannel(replicaChannelName).getId());
  }
  /**
   * Unsupported operation: the checksum of a file cannot be changed through arcrepository.
   *
   * <p>The arguments are validated first; after that the method always fails.
   *
   * @param filename The name of the file to have the checksum changed.
   * @param checksum The new checksum for the file.
   * @throws ArgumentNotValid If either the filename or the checksum is either null or the empty
   *     string.
   * @throws IllegalState Always (for valid arguments), since it is not allowed for arcrepository
   *     to change the checksum of a completed upload.
   */
  @Override
  public void setCheckSum(String filename, String checksum) throws ArgumentNotValid, IllegalState {
    ArgumentNotValid.checkNotNullOrEmpty(filename, "String filename");
    ArgumentNotValid.checkNotNullOrEmpty(checksum, "String checksum");

    // Deliberately not implemented. The message text is kept exactly as it has always been.
    final String reason =
        "It is not possible to change the checksum of a  file in the database! "
            + "Only the checksum of a specific replicafileinfo.";
    throw new IllegalState(reason);
  }
  /**
   * Adds an administration entry for a file.
   *
   * <p>The StoreMessage is registered under the filename first; afterwards the file is inserted
   * into the database as a new file for upload. The StoreMessage itself is not validated.
   *
   * @param filename The name of the file to be stored.
   * @param msg The StoreMessage of the entry.
   * @param checksum The checksum of the entry.
   * @throws ArgumentNotValid If either the filename or checksum is either null or the empty string.
   */
  @Override
  public void addEntry(String filename, StoreMessage msg, String checksum) throws ArgumentNotValid {
    ArgumentNotValid.checkNotNullOrEmpty(filename, "String filename");
    ArgumentNotValid.checkNotNullOrEmpty(checksum, "String checksum");

    // Remember the message for the file ...
    this.storeEntries.put(filename, msg);

    // ... and then record the file with its checksum in the database.
    this.database.insertNewFileForUpload(filename, checksum);
  }
  /**
   * Collects the names of all files that have a specific ReplicaStoreState in a specific replica.
   *
   * @param rep The replica where the files belong.
   * @param state The ReplicaStoreState for the files.
   * @return A new set with the names of the files that have the given ReplicaStoreState in the
   *     given replica.
   * @throws ArgumentNotValid If the Replica or the ReplicaStoreState is null.
   */
  @Override
  public Set<String> getAllFileNames(Replica rep, ReplicaStoreState state) throws ArgumentNotValid {
    ArgumentNotValid.checkNotNull(rep, "Replica rep");
    ArgumentNotValid.checkNotNull(state, "ReplicaStoreState state");

    // Copy the filenames delivered by the database into a fresh set.
    return new HashSet<String>(database.retrieveFilenamesForReplicaEntries(rep.getId(), state));
  }
  /**
   * Message to signal from a BitarchiveServer to the BitarchiveMonitorServer that the Bit Archive
   * Application identified by BA_ApplicationId has completed its part of the batch job.
   *
   * <p>Holds status information: list of files processed and a list of ARC files (file names) on
   * which the batch job failed.
   *
   * <p>The super constructor is called with Channels.getError() as the second channel. Because it
   * has to run first, the argument checks of this constructor are performed after it.
   *
   * @param to the channel to which this message is to be sent (must be a BAMON channel)
   * @param baAppId Identifier for the machine sending this message, usually containing the IP
   *     address and http port number
   * @param originatingBatchMsgId the Id field from the original batch message
   * @param rf The remote file reference containing the output of the batch job (may be null if no
   *     output is generated).
   * @throws ArgumentNotValid If the BA_ApplicationId or the originatingBatchMsgId are null or
   *     empty, or if the channel 'to' is null.
   */
  public BatchEndedMessage(
      ChannelID to, String baAppId, String originatingBatchMsgId, RemoteFile rf)
      throws ArgumentNotValid {
    super(to, Channels.getError());
    ArgumentNotValid.checkNotNull(to, "ChannelID to");
    ArgumentNotValid.checkNotNullOrEmpty(baAppId, "String baAppId");
    ArgumentNotValid.checkNotNullOrEmpty(originatingBatchMsgId, "String originatingBatchMsgId");

    // The result file is stored as given; it is the only argument that is not validated.
    this.rf = rf;
    this.originatingBatchMsgId = originatingBatchMsgId;
    this.baApplicationId = baAppId;
  }
  /**
   * Updates the store state of an entry in the database.
   *
   * @param filename The name of the file for the entry.
   * @param repChannelId The identification channel of the replica for the entry.
   * @param state The new state for the entry.
   * @throws ArgumentNotValid If the ReplicaStoreState is null, or if either the filename or the
   *     replica identification channel is either null or the empty string.
   */
  @Override
  public void setState(String filename, String repChannelId, ReplicaStoreState state)
      throws ArgumentNotValid {
    ArgumentNotValid.checkNotNull(state, "ReplicaStoreState state");
    ArgumentNotValid.checkNotNullOrEmpty(filename, "String filename");
    ArgumentNotValid.checkNotNullOrEmpty(repChannelId, "String repChannelId");

    // Resolve the identification channel to its replica and store the new state for the file
    // in that replica.
    database.setReplicaStoreState(
        filename, Channels.retrieveReplicaFromIdentifierChannel(repChannelId).getId(), state);
  }
  /**
   * Message to signal from a BitarchiveServer to the BitarchiveMonitorServer that the Bit Archive
   * Application identified by BA_ApplicationId has completed its part of the batch job.
   *
   * <p>All status information (application id, result file, number of files processed, failed
   * files and exceptions) is copied from the given BatchStatus.
   *
   * <p>The super constructor is called with Channels.getError() as the second channel. Because it
   * has to run first, the argument checks of this constructor are performed after it.
   *
   * @param to the channel to which this message is to be sent (must be a BAMON channel)
   * @param originatingBatchMsgId the Id field from the original batch message
   * @param status The object containing status info.
   */
  public BatchEndedMessage(ChannelID to, String originatingBatchMsgId, BatchStatus status) {
    super(to, Channels.getError());
    ArgumentNotValid.checkNotNull(to, "to");
    ArgumentNotValid.checkNotNullOrEmpty(originatingBatchMsgId, "String originatingBatchMsgId");
    ArgumentNotValid.checkNotNull(status, "BatchStatus status");

    this.originatingBatchMsgId = originatingBatchMsgId;

    // Everything else is taken over from the status object.
    this.baApplicationId = status.getBitArchiveAppId();
    this.rf = status.getResultFile();
    this.noOfFilesProcessed = status.getNoOfFilesProcessed();
    this.filesFailed = status.getFilesFailed();
    this.exceptions = status.getExceptions();
  }
  /**
   * Tells whether a file in a specific replica has a valid replica store state, where valid means
   * any replica store state other than UNKNOWN_UPLOAD_STATE.
   *
   * <p>TODO Find out if the assumption that all upload states besides UNKNOWN_UPLOAD_STATE are
   * acceptable!
   *
   * @param filename The name of the file for the ReplicaStoreState.
   * @param repChannelId The identification channel of the replica for the ReplicaStoreState.
   * @return Whether the given file in the given replica has a valid store state.
   * @throws ArgumentNotValid If either the filename or the replica identification channel is null
   *     or the empty string.
   */
  @Override
  public boolean hasState(String filename, String repChannelId) throws ArgumentNotValid {
    ArgumentNotValid.checkNotNullOrEmpty(filename, "String filename");
    ArgumentNotValid.checkNotNullOrEmpty(repChannelId, "String repChannelId");

    // Look up the state stored for the file in the replica behind the channel.
    final ReplicaStoreState current =
        database.getReplicaStoreState(
            filename, Channels.retrieveReplicaFromIdentifierChannel(repChannelId).getId());

    // Every state except the unknown one counts as valid.
    return ReplicaStoreState.UNKNOWN_UPLOAD_STATE != current;
  }
  public ExportFrontierReportCsvQuery(ServletRequest req) {

    String jobIdStr = UI_FIELD.JOB_ID.getValue(req);
    ArgumentNotValid.checkNotNullOrEmpty(jobIdStr, UI_FIELD.JOB_ID.name());

    jobId = Long.parseLong(jobIdStr);
  }
  /**
   * Initialise the batch job and set its timeout to seven days.
   *
   * @param regexp The regexp to match in the crawl.log lines.
   */
  public CrawlLogLinesMatchingRegexp(String regexp) {
    ArgumentNotValid.checkNotNullOrEmpty(regexp, "regexp");

    // Timeout: one week, expressed in milliseconds.
    this.batchJobTimeout = 7 * Constants.ONE_DAY_IN_MILLIES;
    this.regexp = regexp;
  }
  /**
   * Tells whether an entry for a file exists in the database.
   *
   * @param filename The name of the file, the existence of whose entry is to be determined.
   * @return Whether the entry exists.
   * @throws ArgumentNotValid If the filename is either null or empty.
   */
  @Override
  public boolean hasEntry(String filename) throws ArgumentNotValid {
    ArgumentNotValid.checkNotNullOrEmpty(filename, "String filename");

    // The database is the only source consulted for the existence of an entry.
    final boolean known = database.existsFileInDB(filename);
    return known;
  }
  /**
   * Removes and returns the StoreMessage of a specific file.
   *
   * @param filename The name of the file whose StoreMessage should be retrieved.
   * @return The StoreMessage that was registered for the file. A null is returned if no
   *     StoreMessage is found for the file.
   * @throws ArgumentNotValid If the filename is either null or the empty string.
   */
  @Override
  public StoreMessage removeReplyInfo(String filename) throws ArgumentNotValid {
    ArgumentNotValid.checkNotNullOrEmpty(filename, "String filename");

    // Take the entry out of the store entries and hand it to the caller.
    final StoreMessage removed = storeEntries.remove(filename);
    return removed;
  }
  /**
   * Tells whether a StoreMessage is registered for a given file.
   *
   * @param filename The name of the file for which the existence of a StoreMessage should be
   *     determined.
   * @return Whether a StoreMessage entry exists for the file.
   * @throws ArgumentNotValid If the filename is null or the empty string.
   */
  @Override
  public boolean hasReplyInfo(String filename) throws ArgumentNotValid {
    ArgumentNotValid.checkNotNullOrEmpty(filename, "String filename");

    // Only the presence of the key is checked, not the value stored under it.
    final boolean registered = storeEntries.containsKey(filename);
    return registered;
  }
 /**
  * Set the axis range.
  *
  * <p>A null range, or a range that does not hold exactly two values, switches the axis to auto
  * range. Otherwise the two values are used as lower and upper bound, and the lower bound must be
  * smaller than the upper bound.
  *
  * @param axis a numberAxis
  * @param range a range, given as {lower, upper}
  */
 private void setAxisRange(NumberAxis axis, double[] range) {
   final boolean explicitRange = range != null && range.length == 2;
   if (!explicitRange) {
     // Nothing usable was given: let the axis pick its own range.
     axis.setAutoRange(true);
     return;
   }

   final double lower = range[0];
   final double upper = range[1];
   ArgumentNotValid.checkTrue(lower < upper, "Incorrect range");

   axis.setAutoRange(false);
   axis.setRange(new Range(lower, upper));
 }
  /**
   * Merges and sorts the given files into one output file by calling the Unix sort command with
   * the options <code>$fileNames -o $outputFile -T
   * WaybackSettings#WAYBACK_AGGREGATOR_TEMP_DIR</code>, followed by any additional arguments.
   *
   * <p>Sets the LC_ALL environment variable to "C" before making the call and runs the command in
   * the user.dir directory.
   *
   * <p>Entries that do not exist or are not regular files are logged and dropped. If no usable
   * input file remains, sort is not started at all: without file operands it would read from
   * standard input and the call would never return.
   *
   * <p>Problems are logged and not propagated to the caller: a non-zero exit code of sort as well
   * as any exception raised inside the method (including the rejection of an additional argument
   * containing a space). If the thread is interrupted while waiting for sort, the interrupt flag
   * is restored before returning.
   *
   * @param files The files to merge and sort
   * @param outputFile The resulting sorted file
   * @param additionalArgs A list of extra arguments, which (if different from null) are added to
   *     the sort call.
   *     <p>Note: If any of the args contain a whitespace the call will fail.
   */
  private void processFiles(File[] files, File outputFile, List<String> additionalArgs) {
    if (files.length == 0) {
      // Empty file list will cause sort to wait for further input,
      // and the call will therefore never return
      return;
    }

    try {
      List<String> inputFileList = new LinkedList<String>();
      for (File file : files) {
        if (file.exists() && file.isFile()) {
          inputFileList.add(file.getCanonicalPath());
        } else {
          log.warn(
              "File "
                  + file
                  + " doesn't exist or isn't a regular file, "
                  + "dropping from list of files to "
                  + "sort and merge");
        }
      }
      if (inputFileList.isEmpty()) {
        // Same hazard as the empty array above: with every candidate dropped, sort would be
        // started without file operands and block on standard input forever.
        log.warn("No existing regular files left to sort and merge, skipping the sort call");
        return;
      }

      List<String> cmd = new LinkedList<String>();
      // Prepare to run the unix sort command, see sort manual page for
      // details
      cmd.add("sort");
      cmd.addAll(inputFileList);
      cmd.add("-o");
      cmd.add(outputFile.getCanonicalPath());
      cmd.add("-T");
      cmd.add(Settings.get(WaybackSettings.WAYBACK_AGGREGATOR_TEMP_DIR));
      if (additionalArgs != null && !additionalArgs.isEmpty()) {
        for (String argument : additionalArgs) {
          ArgumentNotValid.checkTrue(
              argument.indexOf(' ') == -1,
              "The argument '" + argument + "' contains spaces, this isn't allowed ");
        }
        cmd.addAll(additionalArgs);
      }
      ProcessBuilder pb = new ProcessBuilder(cmd);
      // Reset all locale definitions
      pb.environment().put("LC_ALL", "C");
      // Run the command in the user.dir directory
      pb.directory(new File(System.getProperty("user.dir")));
      Process p = pb.start();
      int exitCode = p.waitFor();
      if (exitCode != 0) {
        log.error("Failed to sort index files, sort exited with " + "return code " + exitCode);
      }
    } catch (InterruptedException e) {
      // Keep the interruption visible to the caller instead of silently consuming it.
      Thread.currentThread().interrupt();
      log.error("Failed to aggregate indexes ", e);
    } catch (Exception e) {
      log.error("Failed to aggregate indexes ", e);
    }
  }
 /**
  * Unpacks the message and calls accept() on the unpacked message object.
  *
  * <p>Everything thrown while unpacking or accepting the message is caught and logged as a
  * warning. The null check of the argument happens before that and is not caught.
  *
  * @param msg a ObjectMessage
  */
 public void onMessage(Message msg) {
   ArgumentNotValid.checkNotNull(msg, "Message msg");
   log.trace("Message received:\n{}", msg.toString());
   try {
     final ArchiveMessage unpacked = (ArchiveMessage) JMSConnection.unpack(msg);
     unpacked.accept(this);
   } catch (ClassCastException e) {
     // The payload was not an ArchiveMessage.
     log.warn("Invalid message type", e);
   } catch (Throwable t) {
     // Deliberately broad: nothing may escape from message handling.
     log.warn("Error processing message '{}'", msg, t);
   }
 }
  /**
   * Creates a new NetarkivetMessage.
   *
   * @param to the initial receiver of the message
   * @param replyTo the initial sender of the message
   * @throws ArgumentNotValid if to==replyTo, the replyTo parameter is a topic instead of a queue,
   *     or there is a null parameter.
   */
  protected NetarkivetMessage(ChannelID to, ChannelID replyTo) {
    ArgumentNotValid.checkNotNull(to, "to");
    ArgumentNotValid.checkNotNull(replyTo, "replyTo");

    if (to.getName().equals(replyTo.getName())) {
      throw new ArgumentNotValid("to and replyTo should not be equal.");
    }

    // Have not implemented replying to a topic because there is no use
    // for it in our current architecture
    if (Channels.isTopic(replyTo.getName())) {
      throw new ArgumentNotValid(
          "Reply channel must be queue but " + replyTo.toString() + " is a Topic");
    }

    this.to = to;
    this.replyTo = replyTo;
    this.id = null;
    this.replyOfId = null;
  }
  /**
   * Retrieves the checksum of a given file from the database.
   *
   * @param filename The name of the file, whose checksum should be retrieved.
   * @return The checksum of the file.
   * @throws ArgumentNotValid If the filename is either null or the empty string.
   * @throws UnknownID If no entry exists for the file.
   */
  @Override
  public String getCheckSum(String filename) throws ArgumentNotValid {
    ArgumentNotValid.checkNotNullOrEmpty(filename, "String filename");

    // Only files with an entry have a checksum to deliver.
    if (hasEntry(filename)) {
      return database.getChecksum(filename);
    }

    throw new UnknownID("Don't know anything about file '" + filename + "'");
  }
  /**
   * Create a new batch job that runs the loaded class.
   *
   * <p>The content and the name of the class file are read into this object, the arguments are
   * stored (an empty list replaces null), and the batch job is loaded right away.
   *
   * @param classFile the classfile for the batch job we want to run.
   * @param arguments The arguments for the batchjobs. This can be null. A non-null list is stored
   *     as is, not copied.
   * @throws ArgumentNotValid If the classfile is null.
   */
  public LoadableFileBatchJob(File classFile, List<String> arguments) throws ArgumentNotValid {
    ArgumentNotValid.checkNotNull(classFile, "File classFile");
    fileContents = FileUtils.readBinaryFile(classFile);
    fileName = classFile.getName();

    // Never keep a null argument list.
    this.args = (arguments != null) ? arguments : new ArrayList<String>();

    loadBatchJob();
  }
 /**
  * Copies every line of the record that matches the regexp of this job to the output stream.
  *
  * <p>A line is copied only if the whole line matches the regexp. Each copied line is written as
  * UTF-8 bytes followed by a single '\n'.
  *
  * <p>The regexp is compiled once per record, before any line is read. An invalid regexp is
  * therefore reported (as a PatternSyntaxException) even when the record holds no lines.
  *
  * @param record The record to process.
  * @param os The output stream for the result.
  * @throws ArgumentNotValid on null parameters
  * @throws IOFailure on trouble processing the record.
  */
 @Override
 public void processRecord(ArchiveRecordBase record, OutputStream os) {
   ArgumentNotValid.checkNotNull(record, "ArchiveRecordBase record");
   ArgumentNotValid.checkNotNull(os, "OutputStream os");
   // Compile once instead of letting String.matches recompile the expression for every line.
   // The class is referenced by its fully qualified name so that no additional import is needed.
   final java.util.regex.Pattern linePattern = java.util.regex.Pattern.compile(regexp);
   // NOTE(review): the record is decoded with the platform default charset while the output is
   // encoded as UTF-8 - confirm that this is intended.
   BufferedReader arcreader = new BufferedReader(new InputStreamReader(record.getInputStream()));
   try {
     for (String line = arcreader.readLine(); line != null; line = arcreader.readLine()) {
       if (linePattern.matcher(line).matches()) {
         os.write(line.getBytes("UTF-8"));
         os.write('\n');
       }
     }
   } catch (IOException e) {
     throw new IOFailure("Unable to process (w)arc record", e);
   } finally {
     try {
       arcreader.close();
     } catch (IOException e) {
       // Closing is best effort; a failure here must not hide the outcome of the processing.
       log.warn("unable to close arcreader probably", e);
     }
   }
 }
Example #24
0
 /**
  * Set the actual time when this job was started.
  *
  * <p>Logs a warning if actualStart is set to a time after actualStop. The value is stored
  * regardless, as a copy of the given Date.
  *
  * @param actualStart A Date object representing the time when this job was started.
  */
 public void setActualStart(Date actualStart) {
   ArgumentNotValid.checkNotNull(actualStart, "actualStart");

   final boolean startsAfterStop = actualStop != null && actualStop.before(actualStart);
   if (startsAfterStop) {
     final String warning =
         "Job("
             + getJobID()
             + "): Start time ("
             + actualStart
             + ") is after end time: "
             + actualStop;
     log.warn(warning);
   }

   // Keep a private copy so later changes to the caller's Date do not affect this job.
   this.actualStart = (Date) actualStart.clone();
 }
Example #25
0
 /**
  * Set the actual time when this job was stopped/completed.
  *
  * <p>Logs a warning if no actualStart has been set yet, or if actualStop is set to a time before
  * actualStart. The value is stored regardless, as a copy of the given Date.
  *
  * @param actualStop A Date object representing the time when this job was stopped.
  * @throws ArgumentNotValid if actualStop is null
  */
 public void setActualStop(Date actualStop) throws ArgumentNotValid {
   ArgumentNotValid.checkNotNull(actualStop, "actualStop");

   if (actualStart == null) {
     final String warning =
         "Job(" + getJobID() + "): actualStart should be defined before setting actualStop";
     log.warn(warning);
   } else if (actualStop.before(actualStart)) {
     final String warning =
         "Job("
             + getJobID()
             + "): actualStop ("
             + actualStop
             + ") is before actualStart: "
             + actualStart;
     log.warn(warning);
   }

   // Keep a private copy so later changes to the caller's Date do not affect this job.
   this.actualStop = (Date) actualStop.clone();
 }
Example #26
0
 /**
  * Replace the seedlist of the job with the seeds found in the seedList argument.
  *
  * <p>The argument is read line by line and every line becomes one seed. Duplicate seeds are
  * removed because the seeds are kept in a set. Lines are not filtered, so an empty line is
  * stored as an empty seed.
  *
  * @param seedList List of seeds as one String, one seed per line
  */
 public void setSeedList(String seedList) {
   ArgumentNotValid.checkNotNullOrEmpty(seedList, "seedList");

   // Start from scratch: seeds set earlier are discarded.
   seedListSet = new HashSet<>();
   BufferedReader reader = new BufferedReader(new StringReader(seedList));
   try {
     for (String seed = reader.readLine(); seed != null; seed = reader.readLine()) {
       // The set silently ignores a seed that is already present.
       seedListSet.add(seed);
     }
   } catch (IOException e) {
     // This never happens, as we're reading from a string!
     throw new IOFailure("IOException reading from seed string", e);
   } finally {
     IOUtils.closeQuietly(reader);
   }
 }
Example #27
0
  /**
   * Sets status of this job.
   *
   * <p>Some transitions have side effects, applied before the new status is stored:
   *
   * <ul>
   *   <li>NEW or RESUBMITTED to SUBMITTED: the quota enforcer of the order xml is configured.
   *   <li>SUBMITTED to STARTED: the actual start time is set to now.
   *   <li>STARTED to DONE or FAILED: the actual stop time is set to now.
   * </ul>
   *
   * @param newStatus Must be one of the values STATUS_NEW, ..., STATUS_FAILED
   * @throws ArgumentNotValid in case of invalid status argument or invalid status change
   */
  public void setStatus(JobStatus newStatus) {
    ArgumentNotValid.checkNotNull(newStatus, "newStatus");

    final JobStatus oldStatus = this.status;
    if (!oldStatus.legalChange(newStatus)) {
      final String message =
          "Status change from " + oldStatus + " to " + newStatus + " is not allowed";
      log.debug(message);
      throw new ArgumentNotValid(message);
    }

    // The old status can only be one value, so at most one of the branches below applies.
    if ((oldStatus == JobStatus.NEW || oldStatus == JobStatus.RESUBMITTED)
        && newStatus == JobStatus.SUBMITTED) {
      orderXMLdoc.configureQuotaEnforcer(
          maxObjectsIsSetByQuotaEnforcer, forceMaxBytesPerDomain, forceMaxObjectsPerDomain);
    } else if (oldStatus == JobStatus.SUBMITTED && newStatus == JobStatus.STARTED) {
      setActualStart(new Date());
    } else if (oldStatus == JobStatus.STARTED
        && (newStatus == JobStatus.DONE || newStatus == JobStatus.FAILED)) {
      setActualStop(new Date());
    }

    this.status = newStatus;
  }
Example #28
0
  /**
   * Constructor for the common initialisation of a Job based on a single domain configuration.
   *
   * <p>After validating the arguments, the constructor stores the harvest id, the order xml name
   * of the configuration and the template, sets channel and limits, adds the configuration through
   * addConfiguration, and finally applies the archive format and deduplication settings. The new
   * job gets the status NEW.
   *
   * @param harvestID the id of the harvestdefinition
   * @param cfg the configuration to base the Job on
   * @param orderXMLdoc the harvest template to use as the order xml document of this job.
   *     NOTE(review): it is not null-checked here although it is dereferenced at the end of this
   *     constructor - confirm that callers never pass null.
   * @param channel the channel on which the job will be submitted.
   * @param forceMaxObjectsPerDomain the maximum number of objects harvested from a domain,
   *     overrides individual configuration settings. -1 means no limit
   * @param forceMaxBytesPerDomain The maximum number of bytes harvested from a domain, or -1 for
   *     no limit.
   * @param forceMaxJobRunningTime The max time in seconds given to the harvester for this job
   * @param harvestNum the run number of the harvest definition
   * @throws ArgumentNotValid if cfg, harvestID or channel is null, if harvestID is rejected by
   *     the not-negative check, or if forceMaxObjectsPerDomain or forceMaxBytesPerDomain is less
   *     than -1
   */
  public Job(
      Long harvestID,
      DomainConfiguration cfg,
      HeritrixTemplate orderXMLdoc,
      HarvestChannel channel,
      long forceMaxObjectsPerDomain,
      long forceMaxBytesPerDomain,
      long forceMaxJobRunningTime,
      int harvestNum)
      throws ArgumentNotValid {
    ArgumentNotValid.checkNotNull(cfg, "cfg");
    ArgumentNotValid.checkNotNull(harvestID, "harvestID");
    ArgumentNotValid.checkNotNegative(harvestID, "harvestID");
    ArgumentNotValid.checkNotNull(channel, "channel");

    // Values below -1 are rejected; 0 passes these checks and only triggers a warning further
    // down.
    if (forceMaxObjectsPerDomain < -1) {
      String msg = "forceMaxObjectsPerDomain must be either -1 or positive";
      log.debug(msg);
      throw new ArgumentNotValid(msg);
    }
    if (forceMaxBytesPerDomain < -1) {
      String msg = "forceMaxBytesPerDomain must be either -1 or positive";
      log.debug(msg);
      throw new ArgumentNotValid(msg);
    }

    // A limit of 0 is accepted, but it is most likely a mistake, so it is flagged in the log.
    if (forceMaxBytesPerDomain == 0L) {
      log.warn(
          "forceMaxBytesPerDomain should probably not be 0.Means 0 bytes downloaded per domain");
    }

    if (forceMaxObjectsPerDomain == 0L) {
      log.warn(
          "forceMaxObjectsPerDomain should probably not be 0.Means 0 objects downloaded per domain");
    }

    // setup initial members
    domainConfigurationMap = new HashMap<>();
    origHarvestDefinitionID = harvestID;
    orderXMLname = cfg.getOrderXmlName();
    this.orderXMLdoc = orderXMLdoc;

    setHarvestChannel(channel);

    // Combine the forced limit with the limit of the configuration. The flag records whether
    // the resulting limit differs from the forced one, i.e. stems from the configuration.
    long maxObjects = NumberUtils.minInf(forceMaxObjectsPerDomain, cfg.getMaxObjects());
    setMaxObjectsPerDomain(maxObjects);
    configurationSetsObjectLimit = (maxObjects != forceMaxObjectsPerDomain);

    // Same combination for the byte limit.
    long maxBytes = NumberUtils.minInf(forceMaxBytesPerDomain, cfg.getMaxBytes());
    setMaxBytesPerDomain(maxBytes);
    configurationSetsByteLimit = (maxBytes != forceMaxBytesPerDomain);

    // With a single configuration the minimum and maximum expectation are the same value.
    long expectation =
        cfg.getExpectedNumberOfObjects(forceMaxObjectsPerDomain, forceMaxBytesPerDomain);
    maxCountObjects = expectation;
    minCountObjects = expectation;
    this.harvestNum = harvestNum;

    // Must come after the members above are set up: addConfiguration uses the configuration
    // map, the order xml name and the order xml document.
    addConfiguration(cfg);

    setMaxJobRunningTime(forceMaxJobRunningTime);

    setArchiveFormatInTemplate(Settings.get(HarvesterSettings.HERITRIX_ARCHIVE_FORMAT));

    setAttributes(cfg.getAttributesAndTypes());

    orderXMLdoc.enableOrDisableDeduplication(
        Settings.getBoolean(HarvesterSettings.DEDUPLICATION_ENABLED));

    status = JobStatus.NEW;
  }
Example #29
0
 /**
  * Replace the orderxml document of this job.
  *
  * @param doc The orderxml to be used by this job; must not be null.
  */
 public void setOrderXMLDoc(HeritrixTemplate doc) {
   // A job without a template makes no sense, so null is rejected up front.
   ArgumentNotValid.checkNotNull(doc, "doc");
   orderXMLdoc = doc;
 }
Example #30
0
  /**
   * Adds a configuration to this Job. Seedlists and settings are updated accordingly.
   *
   * <p>Besides registering the configuration under its domain name, the method adds all seeds of
   * the configuration to the job (plus an ASCII/IDNA variant of a seed where that differs from the
   * seed itself), lets the order xml add the per-domain crawler traps, and updates the estimates
   * of the job size.
   *
   * @param cfg the configuration to add
   * @throws ArgumentNotValid if cfg is null or cfg uses a different orderxml than this job or if
   *     this job already contains a configuration associated with domain of configuration cfg.
   * @throws IllegalState if this job is no longer under construction.
   */
  public void addConfiguration(DomainConfiguration cfg) {
    ArgumentNotValid.checkNotNull(cfg, "cfg");
    if (domainConfigurationMap.containsKey(cfg.getDomainName())) {
      throw new ArgumentNotValid(
          "Job already has a configuration for Domain " + cfg.getDomainName());
    }

    // NOTE(review): the second placeholder is labelled "job" but receives the name of the
    // configuration - confirm whether that is intended.
    if (log.isTraceEnabled()) {
      log.trace("Adding configuration '{}' to job '{}'", cfg, cfg.getName());
    }

    if (!underConstruction) {
      final String msg = "Cannot modify job " + this + " as it is no longer under construction";
      log.debug(msg);
      throw new IllegalState(msg);
    }

    // All configurations of a job have to share the job's harvest template.
    final String cfgTemplate = cfg.getOrderXmlName();
    final String jobTemplate = getOrderXMLName();
    if (!cfgTemplate.equals(jobTemplate)) {
      throw new ArgumentNotValid(
          "Job requires the orderxml file:'"
              + jobTemplate
              + "' not:'"
              + cfgTemplate
              + "' used by the configuration:'"
              + cfg.getName());
    }

    domainConfigurationMap.put(cfg.getDomainName(), cfg.getName());

    // Add the seeds from the configuration to the Job seeds. The seeds are kept in a set, so
    // duplicates are silently ignored.
    final Iterator<SeedList> seedLists = cfg.getSeedLists();
    while (seedLists.hasNext()) {
      for (String seedUrl : seedLists.next().getSeeds()) {
        seedListSet.add(seedUrl);

        // TODO remove when heritrix implements this functionality
        // try to convert a seed into a Internationalized Domain Name
        try {
          final String seedASCII;
          if (seedUrl.contains(":") || seedUrl.contains("/")) {
            // It is rare to see these seeds, but they need to be correctly idnaized.
            // If no protocol is given, assume http.
            final String normalizedUrl =
                seedUrl.matches("^[a-zA-Z]+:.*") ? seedUrl : "http://" + seedUrl;
            final String domainName = new URL(normalizedUrl).getHost();
            final String domainNameASCII = IDNA.toASCII(domainName);
            // Only if the domain name changed is it replaced in the seed.
            seedASCII =
                domainName.equals(domainNameASCII)
                    ? seedUrl
                    : seedUrl.replaceFirst(Pattern.quote(domainName), domainNameASCII);
          } else {
            seedASCII = IDNA.toASCII(seedUrl);
          }

          if (!seedASCII.equals(seedUrl)) {
            log.trace("Converted {} to {}", seedUrl, seedASCII);
            seedListSet.add(seedASCII);
          }
        } catch (IDNAException | MalformedURLException e) {
          // The original seed has already been added; only the ASCII variant is skipped.
          log.trace("Cannot convert seed {} to ASCII", seedUrl, e);
        }
      }
    }

    orderXMLdoc.editOrderXMLAddPerDomainCrawlerTraps(cfg);

    // TODO update limits in settings files - see also bug 269

    // Update estimates of job size
    final long expectation =
        cfg.getExpectedNumberOfObjects(forceMaxObjectsPerDomain, forceMaxBytesPerDomain);
    maxCountObjects = Math.max(expectation, maxCountObjects);
    minCountObjects = Math.min(expectation, minCountObjects);
    totalCountObjects += expectation;

    configsChanged = true;

    assert (maxCountObjects >= minCountObjects) : "basic invariant";
  }