Code example #1
  /**
   * Create a bitarchive with access denied to the location of admin data, and verify that an
   * exception is thrown.
   */
  @Test
  public void testAccessDenied() {
    // Make sure archive exists
    assertTrue("Inaccessible archive dir must exist", NOACCESS_ARCHIVE_DIR.exists());

    if (NOACCESS_ARCHIVE_DIR.canWrite()) {
      NOACCESS_ARCHIVE_DIR.setReadOnly();
    }

    // and that the archive dir is inaccessible
    assertFalse(
        "Must not be able to write to inaccessible archive dir", NOACCESS_ARCHIVE_DIR.canWrite());

    try {
      Settings.set(
          ArchiveSettings.BITARCHIVE_SERVER_FILEDIR, NOACCESS_ARCHIVE_DIR.getAbsolutePath());
      Bitarchive ba = Bitarchive.getInstance();
      ba.close();
      fail("Accessing read-only archive should throw exception"); // do not come here
    } catch (PermissionDenied e) {
      // Expected case
      StringAsserts.assertStringContains(
          "Should mention noaccess dir", "noaccess/filedir", e.getMessage());
    }
  }
  /** FIXME Fails in Hudson */
  public void failingTestExecute() {
    File arcFile = new File(TestInfo.BATCH_DIR, "MimeUrlSearch.jar");
    assertTrue(arcFile.isFile());

    Settings.set(
        "settings.common.batch.batchjobs.batchjob.class",
        "dk.netarkivet.common.utils.batch.UrlSearch");
    Settings.set("settings.common.batch.batchjobs.batchjob.jarfile", arcFile.getAbsolutePath());

    MockHttpServletRequest request =
        new MockHttpServletRequest() {
          @Override
          public Locale getLocale() {
            return new Locale("en");
          }

          @Override
          public int getRemotePort() {
            return 0; // stub value, not used by this test
          }

          @Override
          public String getLocalName() {
            return null; // stub value, not used by this test
          }

          @Override
          public String getLocalAddr() {
            return null; // stub value, not used by this test
          }

          @Override
          public int getLocalPort() {
            return 0; // stub value, not used by this test
          }
        };
    request.setupAddParameter(Constants.FILETYPE_PARAMETER, BatchFileType.Metadata.toString());
    request.setupAddParameter(Constants.JOB_ID_PARAMETER, "1234567890");
    request.setupAddParameter(
        Constants.BATCHJOB_PARAMETER, "dk.netarkivet.common.utils.batch.UrlSearch");
    request.setupAddParameter(Constants.REPLICA_PARAMETER, "BarOne");
    request.setupAddParameter("arg1", "DUMMY-ARG");
    request.setupAddParameter("arg2", "url");
    request.setupAddParameter("arg3", "mimetype");

    Locale l = new Locale("en");
    JspWriterMockup out = new JspWriterMockup();

    PageContext context = new WebinterfaceTestCase.TestPageContext(request, out, l);
    BatchGUI.execute(context);
  }
Code example #3
 @Before
 public void setUp() {
   rs.setUp();
   mjms.setUp();
   listener = new CreateIndexListener();
   JMSConnectionFactory.getInstance().setListener(Channels.getTheIndexServer(), listener);
   Settings.set(
       CommonSettings.REMOTE_FILE_CLASS, "dk.netarkivet.common.distribute.NullRemoteFile");
   Settings.set(CommonSettings.CACHE_DIR, TestInfo.CACHE_DIR.getPath());
   mtf.setUp();
   pss.setUp();
   pse.setUp();
 }
  public void setUp() throws Exception {
    ChannelsTester.resetChannels();
    rs.setUp();
    mtf.setUp();
    utrf.setUp();

    JMSConnectionMockupMQ.useJMSConnectionMockupMQ();

    DatabaseTestUtils.takeDatabase(TestInfo.DATABASE_FILE, TestInfo.DATABASE_DIR);

    // define the settings for accessing the database
    Settings.set(ArchiveSettings.BASEURL_ARCREPOSITORY_ADMIN_DATABASE, TestInfo.DATABASE_URL);
    Settings.set(ArchiveSettings.MACHINE_ARCREPOSITORY_ADMIN_DATABASE, "");
    Settings.set(ArchiveSettings.PORT_ARCREPOSITORY_ADMIN_DATABASE, "");
    Settings.set(ArchiveSettings.DIR_ARCREPOSITORY_ADMIN_DATABASE, "");

    Settings.set(CommonSettings.NOTIFICATIONS_CLASS, PrintNotifications.class.getName());
  }
  /**
   * Calls the Unix sort command with the options <code>$filesNames -o
   * $outputfile -T WaybackSettings#WAYBACK_AGGREGATOR_TEMP_DIR</code>.
   *
   * <p>Sets the LC_ALL environment variable before making the call.
   *
   * @param files The files to merge and sort
   * @param outputFile The resulting sorted file
   * @param additionalArgs A list of extra arguments, which (if different from
   *                       null) are added to the sort call.
   *                       <p>Note: If any of the args contain whitespace the
   *                       call will fail.
   */
  private void processFiles(File[] files, File outputFile, List<String> additionalArgs) {
    if (files.length == 0) {
      // Empty file list will cause sort to wait for further input,
      // and the call will therefore never return
      return;
    }

    Process p = null;

    try {
      List<String> inputFileList = new LinkedList<String>();
      for (int i = 0; i < files.length; i++) {
        if (files[i].exists() && files[i].isFile()) {
          inputFileList.add(files[i].getCanonicalPath());
        } else {
          log.warn(
              "File "
                  + files[i]
                  + " doesn't exist or isn't a regular file, "
                  + "dropping from list of files to "
                  + "sort and merge");
        }
      }
      List<String> cmd = new LinkedList<String>();
      // Prepare to run the unix sort command, see sort manual page for
      // details
      cmd.add("sort");
      cmd.addAll(inputFileList);
      cmd.add("-o");
      cmd.add(outputFile.getCanonicalPath());
      cmd.add("-T");
      cmd.add(Settings.get(WaybackSettings.WAYBACK_AGGREGATOR_TEMP_DIR));
      if (additionalArgs != null && !additionalArgs.isEmpty()) {
        for (String argument : additionalArgs) {
          ArgumentNotValid.checkTrue(
              argument.indexOf(' ') == -1,
              "The argument '" + argument + "' contains spaces, which is not allowed");
        }
        cmd.addAll(additionalArgs);
      }
      ProcessBuilder pb = new ProcessBuilder(cmd);
      // Reset all locale definitions
      pb.environment().put("LC_ALL", "C");
      // Run the command in the user.dir directory
      pb.directory(new File(System.getProperty("user.dir")));
      p = pb.start();
      p.waitFor();
      if (p.exitValue() != 0) {
        log.error("Failed to sort index files, sort exited with " + "return code " + p.exitValue());
      }
    } catch (Exception e) {
      log.error("Failed to aggregate indexes ", e);
    }
  }
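For concreteness, the method above assembles the equivalent of the following command line for two input files (a sketch with illustrative paths, not from the original source):

// Illustrative: with inputs a.cdx and b.cdx, output file merged.cdx, and the
// aggregator temp dir set to /tmp/wayback, processFiles effectively runs
//
//   LC_ALL=C sort a.cdx b.cdx -o merged.cdx -T /tmp/wayback
//
// from the user.dir working directory.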
Code example #6
 @Before
 public void setUp() {
   rs.setUp();
   ulrf.setUp();
   mjms.setUp();
   mtf.setUp();
   pss.setUp();
   pse.setUp();
   marc.setUp();
   Settings.set(CommonSettings.NOTIFICATIONS_CLASS, RememberNotifications.class.getName());
 }
Code example #7
 /** Create bitarchive from scratch, where no admin data or log files exist. */
 @Test
 public void testFromScratch() {
   LogbackRecorder lr = LogbackRecorder.startRecorder();
   assertFalse("No bitarchive should exist before creating it", NEW_ARCHIVE_DIR.exists());
   // Create new test archive and close it
   Settings.set(ArchiveSettings.BITARCHIVE_SERVER_FILEDIR, NEW_ARCHIVE_DIR.getAbsolutePath());
   Bitarchive ba = Bitarchive.getInstance();
   ba.close();
   // verify that the directory, admin and log files are created
   assertTrue("The archive dir should exist after creation", NEW_ARCHIVE_DIR.exists());
   assertTrue("Log file should exist after creation", !lr.isEmpty());
   lr.stopRecorder();
 }
Code example #8
  @Before
  public void setUp() throws Exception {
    rs.setUp();
    mtf.setUp();
    JMSConnectionMockupMQ.useJMSConnectionMockupMQ();

    Settings.set(ArchiveSettings.DIRS_ARCREPOSITORY_ADMIN, TestInfo.WORKING_DIR.getAbsolutePath());

    if (!Replica.isKnownReplicaId("TWO") || !Replica.isKnownReplicaId("ONE")) {
      fail(
          "These tests assume that ONE and TWO are known replica ids. Only known replicas are: "
              + StringUtils.conjoin(", ", Arrays.asList(Replica.getKnownIds())));
    }
    // super.setUp();
  }
Code example #9
public class IndexerTestCase {

  private String oldClient = System.getProperty(CommonSettings.ARC_REPOSITORY_CLIENT);
  private String oldFileDir = System.getProperty("settings.common.arcrepositoryClient.fileDir");
  protected static File tempdir = new File(Settings.get(WaybackSettings.WAYBACK_INDEX_TEMPDIR));

  ReloadSettings rs = new ReloadSettings();

  @Before
  public void setUp() {
    rs.setUp();
    System.setProperty(WaybackSettings.HIBERNATE_HBM2DDL_AUTO, "create-drop");
    HibernateUtil.getSession().getSessionFactory().close();
    FileUtils.removeRecursively(TestInfo.WORKING_DIR);
    TestFileUtils.copyDirectoryNonCVS(TestInfo.ORIGINALS_DIR, TestInfo.WORKING_DIR);
    System.setProperty(
        CommonSettings.ARC_REPOSITORY_CLIENT,
        "dk.netarkivet.common.distribute.arcrepository.LocalArcRepositoryClient");
    System.setProperty(
        "settings.common.arcrepositoryClient.fileDir", TestInfo.FILE_DIR.getAbsolutePath());
    System.setProperty(
        CommonSettings.REMOTE_FILE_CLASS, "dk.netarkivet.common.distribute.TestRemoteFile");
    assertTrue(
        ArcRepositoryClientFactory.getPreservationInstance() instanceof LocalArcRepositoryClient);
  }

  @After
  public void tearDown() {
    HibernateUtil.getSession().getSessionFactory().close();
    FileUtils.removeRecursively(TestInfo.WORKING_DIR);
    if (oldClient != null) {
      System.setProperty(CommonSettings.ARC_REPOSITORY_CLIENT, oldClient);
    } else {
      System.setProperty(CommonSettings.ARC_REPOSITORY_CLIENT, "");
    }
    if (oldFileDir != null) {
      System.setProperty("settings.common.arcrepositoryClient.fileDir", oldFileDir);
    } else {
      System.setProperty("settings.common.arcrepositoryClient.fileDir", "");
    }
    rs.tearDown();
  }
}
  public void setUp() {
    mtf.setUp();
    rs.setUp();

    Settings.set(CommonSettings.BATCHJOBS_BASEDIR, TestInfo.BATCH_DIR.getAbsolutePath());
  }
Code example #11
/**
 * Batchjob that extracts lines from a crawl log matching a regular expression. The batch job
 * should be restricted to run on metadata files for a specific job only, using the {@link
 * #processOnlyFilesMatching(String)} construct.
 */
@SuppressWarnings({"serial"})
public class CrawlLogLinesMatchingRegexp extends ArchiveBatchJob {

  /** The logger. */
  // private final Log log = LogFactory.getLog(getClass().getName());
  private static final Logger log = LoggerFactory.getLogger(CrawlLogLinesMatchingRegexp.class);

  /** Metadata URL for crawl logs. */
  private static final String SETUP_URL_FORMAT =
      String.format(
          "metadata://%s/crawl/logs/crawl.log", Settings.get(CommonSettings.ORGANIZATION));

  /** The regular expression to match in the crawl.log line. */
  private final String regexp;

  /**
   * Initialise the batch job.
   *
   * @param regexp The regexp to match in the crawl.log lines.
   */
  public CrawlLogLinesMatchingRegexp(String regexp) {
    ArgumentNotValid.checkNotNullOrEmpty(regexp, "regexp");
    this.regexp = regexp;

    /** One week in milliseconds. */
    batchJobTimeout = 7 * Constants.ONE_DAY_IN_MILLIES;
  }

  /**
   * Does nothing, no initialisation is needed.
   *
   * @param os Not used.
   */
  @Override
  public void initialize(OutputStream os) {}

  @Override
  public ArchiveBatchFilter getFilter() {
    return new ArchiveBatchFilter("OnlyCrawlLog") {
      public boolean accept(ArchiveRecordBase record) {
        String url = record.getHeader().getUrl();
        if (url == null) {
          return false;
        } else {
          return url.startsWith(SETUP_URL_FORMAT);
        }
      }
    };
  }

  /**
   * Process a record on crawl log concerning the given domain to result.
   *
   * @param record The record to process.
   * @param os The output stream for the result.
   * @throws ArgumentNotValid on null parameters
   * @throws IOFailure on trouble processing the record.
   */
  @Override
  public void processRecord(ArchiveRecordBase record, OutputStream os) {
    ArgumentNotValid.checkNotNull(record, "ArchiveRecordBase record");
    ArgumentNotValid.checkNotNull(os, "OutputStream os");
    BufferedReader arcreader = new BufferedReader(new InputStreamReader(record.getInputStream()));
    try {
      for (String line = arcreader.readLine(); line != null; line = arcreader.readLine()) {
        if (line.matches(regexp)) {
          os.write(line.getBytes("UTF-8"));
          os.write('\n');
        }
      }
    } catch (IOException e) {
      throw new IOFailure("Unable to process (w)arc record", e);
    } finally {
      try {
        arcreader.close();
      } catch (IOException e) {
        log.warn("unable to close arcreader probably", e);
      }
    }
  }

  /**
   * Does nothing, no finishing is needed.
   *
   * @param os Not used.
   */
  @Override
  public void finish(OutputStream os) {}

  @Override
  public String toString() {
    return getClass().getName()
        + ", with arguments: Regexp = "
        + regexp
        + ", Filter = "
        + getFilter();
  }
}
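A minimal usage sketch (not from the original source; the job id and the metadata file-name pattern are assumptions): restrict the batch job to the metadata files of one harvest job, as the class Javadoc recommends, before submitting it.

// Sketch: find crawl-log lines mentioning ".pdf" in the metadata of job 1234.
// The "1234-metadata-.*" pattern is an assumed naming scheme for metadata files.
CrawlLogLinesMatchingRegexp job = new CrawlLogLinesMatchingRegexp(".*\\.pdf.*");
job.processOnlyFilesMatching("1234-metadata-.*");
// The job can then be submitted through the usual batch interface, e.g. via
// an ArcRepositoryClient batch() call.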
Code example #12
  @Test
  public void testProcessMissingRequest() throws Exception {

    Settings.set(
        ArchiveSettings.DIR_ARCREPOSITORY_BITPRESERVATION, TestInfo.WORKING_DIR.getAbsolutePath());
    Settings.set(ArchiveSettings.DIRS_ARCREPOSITORY_ADMIN, TestInfo.WORKING_DIR.getAbsolutePath());

    // Ensure that admin data exists before we start.
    AdminData.getUpdateableInstance();

    MockFileBasedActiveBitPreservation mockabp = new MockFileBasedActiveBitPreservation();
    MockHttpServletRequest request = new MockHttpServletRequest();
    String replicaID1 = "ONE";
    String replicaID2 = "TWO";
    String filename1 = "foo";
    String filename2 = "bar";
    Locale defaultLocale = new Locale("da");

    // First test a working set of params
    Map<String, String[]> args = new HashMap<String, String[]>();
    args.put(
        ADD_COMMAND,
        new String[] {
          Replica.getReplicaFromId(replicaID1).getName() + STRING_FILENAME_SEPARATOR + filename1
        });
    request.setupAddParameter(
        ADD_COMMAND,
        new String[] {
          Replica.getReplicaFromId(replicaID1).getName() + STRING_FILENAME_SEPARATOR + filename1
        });
    args.put(GET_INFO_COMMAND, new String[] {filename1});
    request.setupAddParameter(GET_INFO_COMMAND, new String[] {filename1});
    args.put(BITARCHIVE_NAME_PARAM, new String[] {Replica.getReplicaFromId(replicaID1).getName()});
    request.setupAddParameter(
        BITARCHIVE_NAME_PARAM, new String[] {Replica.getReplicaFromId(replicaID1).getName()});
    request.setupGetParameterMap(args);
    request.setupGetParameterNames(new Vector<String>(args.keySet()).elements());
    Map<String, PreservationState> status =
        BitpreserveFileState.processMissingRequest(
            WebinterfaceTestCase.getDummyPageContext(defaultLocale, request), new StringBuilder());
    assertEquals("Should have one call to reestablish", 1, mockabp.getCallCount(ADD_METHOD));
    assertEquals(
        "Should have one call to getFilePreservationStatus",
        1,
        mockabp.getCallCount(GET_INFO_METHOD));
    assertEquals("Should have one info element (with mock results)", null, status.get(filename1));

    // Check that we can call without any params
    mockabp.calls.clear();
    request = new MockHttpServletRequest();
    args.clear();
    args.put(BITARCHIVE_NAME_PARAM, new String[] {Replica.getReplicaFromId(replicaID1).getName()});
    request.setupAddParameter(
        BITARCHIVE_NAME_PARAM, new String[] {Replica.getReplicaFromId(replicaID1).getName()});
    request.setupGetParameterMap(args);
    status =
        BitpreserveFileState.processMissingRequest(
            WebinterfaceTestCase.getDummyPageContext(defaultLocale, request), new StringBuilder());
    assertEquals("Should have no call to restablish", 0, mockabp.getCallCount(ADD_METHOD));
    assertEquals(
        "Should have no call to getFilePreservationStatus",
        0,
        mockabp.getCallCount(GET_INFO_METHOD));
    assertEquals("Should have no status", 0, status.size());

    // Check that we can handle more than one call to each and that the
    // args are correct.
    mockabp.calls.clear();
    request = new MockHttpServletRequest();
    args.clear();
    args.put(BITARCHIVE_NAME_PARAM, new String[] {Replica.getReplicaFromId(replicaID2).getName()});
    request.setupAddParameter(
        BITARCHIVE_NAME_PARAM, new String[] {Replica.getReplicaFromId(replicaID2).getName()});
    request.setupAddParameter(
        ADD_COMMAND,
        new String[] {
          Replica.getReplicaFromId(replicaID2).getName() + STRING_FILENAME_SEPARATOR + filename1,
          Replica.getReplicaFromId(replicaID2).getName() + STRING_FILENAME_SEPARATOR + filename1
        });
    args.put(
        ADD_COMMAND,
        new String[] {
          Replica.getReplicaFromId(replicaID2).getName() + STRING_FILENAME_SEPARATOR + filename1,
          Replica.getReplicaFromId(replicaID2).getName() + STRING_FILENAME_SEPARATOR + filename1
        });
    request.setupAddParameter(GET_INFO_COMMAND, new String[] {filename1, filename2, filename1});
    args.put(GET_INFO_COMMAND, new String[] {filename1, filename2, filename1});
    request.setupGetParameterMap(args);
    status =
        BitpreserveFileState.processMissingRequest(
            WebinterfaceTestCase.getDummyPageContext(defaultLocale, request), new StringBuilder());
    assertEquals("Should have two calls to restablish", 2, mockabp.getCallCount(ADD_METHOD));
    assertEquals(
        "Should have three calls to getFilePreservationStatus",
        3,
        mockabp.getCallCount(GET_INFO_METHOD));
    assertEquals("Should have two info elements", 2, status.size());
    assertEquals("Should have info for filename1", null, status.get(filename1));
    assertEquals("Should have info for filename2", null, status.get(filename2));

    // Iterator<String> it = mockabp.calls.get(ADD_METHOD).iterator();
    // while (it.hasNext()) {
    // System.out.println(it.next());
    // }

    CollectionAsserts.assertIteratorEquals(
        "Should have the args given add",
        Arrays.asList(new String[] {filename1 + "," + replicaID2, filename1 + "," + replicaID2})
            .iterator(),
        mockabp.calls.get(ADD_METHOD).iterator());

    CollectionAsserts.assertIteratorEquals(
        "Should have the args given info",
        Arrays.asList(new String[] {filename1, filename2, filename1}).iterator(),
        mockabp.calls.get(GET_INFO_METHOD).iterator());
  }
/**
 * A base class for {@link JobGenerator} implementations. It is recommended to extend this class to
 * implement a new job generator.
 *
 * <p>The base algorithm iterates over the domain configurations within the harvest definition
 * and, according to the configured subset size ({@link
 * HarvesterSettings#JOBGEN_DOMAIN_CONFIG_SUBSET_SIZE}), builds subsets of domain configurations
 * from which one or more jobs will be generated.
 */
abstract class AbstractJobGenerator implements JobGenerator {

  /** Logger for this class. */
  private static Log log = LogFactory.getLog(AbstractJobGenerator.class);

  /** How many domain configurations to process in one go. */
  private final long DOMAIN_CONFIG_SUBSET_SIZE =
      Settings.getLong(HarvesterSettings.JOBGEN_DOMAIN_CONFIG_SUBSET_SIZE);

  /** Is deduplication enabled or disabled. * */
  private final boolean DEDUPLICATION_ENABLED =
      Settings.getBoolean(HarvesterSettings.DEDUPLICATION_ENABLED);

  @Override
  public int generateJobs(HarvestDefinition harvest) {
    log.info("Generating jobs for harvestdefinition # " + harvest.getOid());
    int jobsMade = 0;
    final Iterator<DomainConfiguration> domainConfigurations = harvest.getDomainConfigurations();

    while (domainConfigurations.hasNext()) {
      List<DomainConfiguration> subset = new ArrayList<DomainConfiguration>();
      while (domainConfigurations.hasNext() && subset.size() < DOMAIN_CONFIG_SUBSET_SIZE) {
        subset.add(domainConfigurations.next());
      }

      Collections.sort(subset, getDomainConfigurationSubsetComparator(harvest));
      if (log.isTraceEnabled()) {
        log.trace(
            subset.size()
                + " domainconfigs now sorted and ready to processing "
                + "for harvest #"
                + harvest.getOid());
      }
      jobsMade += processDomainConfigurationSubset(harvest, subset.iterator());
    }
    harvest.setNumEvents(harvest.getNumEvents() + 1);

    if (!harvest.isSnapShot()) {
      PartialHarvest focused = (PartialHarvest) harvest;
      Schedule schedule = focused.getSchedule();
      int numEvents = harvest.getNumEvents();

      // Calculate next event
      Date now = new Date();
      Date nextEvent = schedule.getNextEvent(focused.getNextDate(), numEvents);

      // Refuse to schedule event in the past
      if (nextEvent != null && nextEvent.before(now)) {
        int eventsSkipped = 0;
        while (nextEvent != null && nextEvent.before(now)) {
          nextEvent = schedule.getNextEvent(nextEvent, numEvents);
          eventsSkipped++;
        }
        if (log.isWarnEnabled()) {
          log.warn(
              "Refusing to schedule harvest definition '"
                  + harvest.getName()
                  + "' in the past. Skipped "
                  + eventsSkipped
                  + " events. Old nextDate was "
                  + focused.getNextDate()
                  + " new nextDate is "
                  + nextEvent);
        }
      }

      // Set next event
      focused.setNextDate(nextEvent);
      if (log.isTraceEnabled()) {
        log.trace(
            "Next event for harvest definition "
                + harvest.getName()
                + " happens: "
                + (nextEvent == null ? "Never" : nextEvent.toString()));
      }
    }

    log.info(
        "Finished generating " + jobsMade + " jobs for harvestdefinition # " + harvest.getOid());
    return jobsMade;
  }

  /**
   * Instantiates a new job.
   *
   * @param cfg the {@link DomainConfiguration} being processed
   * @param harvest the {@link HarvestDefinition} being processed
   * @return an instance of {@link Job}
   */
  public static Job getNewJob(HarvestDefinition harvest, DomainConfiguration cfg) {
    HarvestChannelDAO harvestChannelDao = HarvestChannelDAO.getInstance();
    HarvestChannel channel = harvestChannelDao.getChannelForHarvestDefinition(harvest.getOid());
    if (channel == null) {
      log.info(
          "No channel mapping registered for harvest id "
              + harvest.getOid()
              + ", will use default.");
      channel = harvestChannelDao.getDefaultChannel(harvest.isSnapShot());
    }
    if (harvest.isSnapShot()) {
      return Job.createSnapShotJob(
          harvest.getOid(),
          channel,
          cfg,
          harvest.getMaxCountObjects(),
          harvest.getMaxBytes(),
          ((FullHarvest) harvest).getMaxJobRunningTime(),
          harvest.getNumEvents());
    }
    return Job.createJob(harvest.getOid(), channel, cfg, harvest.getNumEvents());
  }

  /**
   * Returns a comparator used to sort the subset of {@link #DOMAIN_CONFIG_SUBSET_SIZE}
   * configurations that are scanned at each iteration.
   *
   * @param harvest the {@link HarvestDefinition} being processed.
   * @return a comparator
   */
  protected abstract Comparator<DomainConfiguration> getDomainConfigurationSubsetComparator(
      HarvestDefinition harvest);

  /**
   * Create new jobs from a collection of configurations. All configurations must use the same
   * order.xml file.
   *
   * @param harvest the {@link HarvestDefinition} being processed.
   * @param domainConfSubset the configurations to use to create the jobs
   * @return The number of jobs created
   * @throws ArgumentNotValid if any of the parameters is null or if the cfglist does not contain
   *     any configurations
   */
  protected abstract int processDomainConfigurationSubset(
      HarvestDefinition harvest, Iterator<DomainConfiguration> domainConfSubset);

  @Override
  public boolean canAccept(Job job, DomainConfiguration cfg) {
    if (!checkAddDomainConfInvariant(job, cfg)) {
      return false;
    }
    return checkSpecificAcceptConditions(job, cfg);
  }

  /**
   * Called by {@link #canAccept(Job, DomainConfiguration)}. Tests the implementation-specific
   * conditions to accept the given {@link DomainConfiguration} in the given {@link Job}. It is
   * assumed that {@link #checkAddDomainConfInvariant(Job, DomainConfiguration)} has already passed.
   *
   * @param job the {@link Job} being built
   * @param cfg the {@link DomainConfiguration} to test
   * @return true if the configuration passes the conditions.
   */
  protected abstract boolean checkSpecificAcceptConditions(Job job, DomainConfiguration cfg);

  /**
   * Once the job has been filled with {@link DomainConfiguration}s, performs the following
   * operations:
   *
   * <ol>
   *   <li>Edit the harvest template to add/remove deduplicator configuration.
   * </ol>
   *
   * @param job the job
   */
  protected void editJobOrderXml(Job job) {
    Document doc = job.getOrderXMLdoc();
    if (DEDUPLICATION_ENABLED) {
      // Check that the Deduplicator element is present in the
      // OrderXML and enabled. If missing or disabled, log a warning
      if (!HeritrixTemplate.isDeduplicationEnabledInTemplate(doc)) {
        if (log.isWarnEnabled()) {
          log.warn(
              "Unable to perform deduplication for this job"
                  + " as the required DeDuplicator element is "
                  + "disabled or missing from template");
        }
      }
    } else {
      // Remove deduplicator Element from OrderXML if present
      Node xpathNode = doc.selectSingleNode(HeritrixTemplate.DEDUPLICATOR_XPATH);
      if (xpathNode != null) {
        xpathNode.detach();
        job.setOrderXMLDoc(doc);
        if (log.isInfoEnabled()) {
          log.info("Removed DeDuplicator element because " + "Deduplication is disabled");
        }
      }
    }
  }

  /**
   * Tests that:
   *
   * <ol>
   *   <li>The given domain configuration and job are not null.
   *   <li>The job does not already contain the given domain configuration.
   *   <li>The domain configuration has the same order xml name as the first inserted domain config.
   * </ol>
   *
   * @param job a given Job
   * @param cfg a given DomainConfiguration
   * @return true, if the given DomainConfiguration can be inserted into the given job
   */
  private boolean checkAddDomainConfInvariant(Job job, DomainConfiguration cfg) {
    ArgumentNotValid.checkNotNull(job, "job");
    ArgumentNotValid.checkNotNull(cfg, "cfg");

    // check if domain in DomainConfiguration cfg is not already in this job
    // domainName is used as key in domainConfigurationMap
    if (job.getDomainConfigurationMap().containsKey(cfg.getDomainName())) {
      if (log.isDebugEnabled()) {
        log.debug("Job already has a configuration for Domain '" + cfg.getDomainName() + "'.");
      }
      return false;
    }

    // check if template is same as this job.
    String orderXMLname = job.getOrderXMLName();
    if (!orderXMLname.equals(cfg.getOrderXmlName())) {
      if (log.isDebugEnabled()) {
        log.debug(
            "This Job only accept configurations "
                + "using the harvest template '"
                + orderXMLname
                + "'. This configuration uses the harvest template '"
                + cfg.getOrderXmlName()
                + "'.");
      }
      return false;
    }

    return true;
  }
}
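To illustrate the extension pattern described above, here is a hypothetical minimal subclass (a sketch; the class name and the use of JobDAO for persistence are assumptions, not part of the original source):

/**
 * Hypothetical minimal job generator sketch: packs each subset into a single
 * job and applies no implementation-specific accept conditions.
 */
class MinimalJobGenerator extends AbstractJobGenerator {

  @Override
  protected Comparator<DomainConfiguration> getDomainConfigurationSubsetComparator(
      HarvestDefinition harvest) {
    return new Comparator<DomainConfiguration>() {
      public int compare(DomainConfiguration c1, DomainConfiguration c2) {
        // Illustrative ordering only; a real generator would compare expected sizes.
        return c1.getDomainName().compareTo(c2.getDomainName());
      }
    };
  }

  @Override
  protected int processDomainConfigurationSubset(
      HarvestDefinition harvest, Iterator<DomainConfiguration> domainConfSubset) {
    Job job = null;
    while (domainConfSubset.hasNext()) {
      DomainConfiguration cfg = domainConfSubset.next();
      if (job == null) {
        job = getNewJob(harvest, cfg);
      } else if (canAccept(job, cfg)) {
        job.addConfiguration(cfg);
      }
    }
    if (job == null) {
      return 0;
    }
    editJobOrderXml(job); // apply the deduplication edits described above
    JobDAO.getInstance().create(job); // persisting via JobDAO is an assumption here
    return 1;
  }

  @Override
  protected boolean checkSpecificAcceptConditions(Job job, DomainConfiguration cfg) {
    return true; // no implementation-specific restrictions in this sketch
  }
}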
/**
 * This class implements a generator for an history chart of a running job. The chart traces the
 * progress percentage and the queued URI count over the crawl time. Charts are rendered in a PNG
 * image file, generated in the webapp directory.
 */
class StartedJobHistoryChartGen {

  /** Time units used to scale the crawl time values and generate the chart's time axis ticks. */
  protected static enum TimeAxisResolution {
    /** One second. Tick step is 10s. */
    second(1, 1, 10),
    /** One minute. Tick step is 5m. */
    minute(60, 60, 5),
    /** One hour. Tick step is 1h. */
    hour(60 * minute.seconds, 60 * minute.seconds, 1),
    /** Twelve hours. Tick step is 2h. */
    half_day(12 * 60 * minute.seconds, 60 * minute.seconds, 2),
    /** One day. Tick step is 0.5d. */
    day(24 * hour.seconds, 24 * hour.seconds, 0.5),
    /** One week. Tick step is 1w. */
    week(7 * day.seconds, 7 * day.seconds, 1);

    /** The time unit in seconds. */
    private final int seconds;

    /** The scale in seconds. */
    private final int scaleSeconds;

    /** The step between two tick units. */
    private final double tickStep;

    /**
     * Builds a time axis resolution.
     *
     * @param seconds the actual resolution in seconds
     * @param scaleSeconds the actual "scale" of ticks
     * @param tickStep the number of ticks in one step.
     */
    TimeAxisResolution(int seconds, int scaleSeconds, double tickStep) {
      this.seconds = seconds;
      this.scaleSeconds = scaleSeconds;
      this.tickStep = tickStep;
    }

    /**
     * Scale down an array of seconds.
     *
     * @param timeInSeconds An array of seconds
     * @return a scaled down version of the given array of seconds
     */
    double[] scale(double[] timeInSeconds) {
      double[] scaledTime = new double[timeInSeconds.length];
      for (int i = 0; i < timeInSeconds.length; i++) {
        scaledTime[i] = timeInSeconds[i] / this.scaleSeconds;
      }
      return scaledTime;
    }

    /**
     * @param seconds the seconds
     * @return the proper timeUnit for the given argument
     */
    static TimeAxisResolution findTimeUnit(double seconds) {

      TimeAxisResolution[] allTus = values();
      for (int i = 0; i < allTus.length - 1; i++) {
        TimeAxisResolution nextGreater = allTus[i + 1];
        if (seconds < nextGreater.seconds) {
          return allTus[i];
        }
      }
      return week; // largest unit
    }
  }
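  // Illustration (not in the original source): for a 3-hour crawl,
  // findTimeUnit(10800) returns 'hour', since 10800 s is below half_day's
  // 43200 s span; hour.scale(new double[] {0, 1800, 10800}) then yields
  // {0.0, 0.5, 3.0} for the axis values.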

  /** A chart generation task. Generates a PNG image for a job progress history. */
  private static class ChartGen implements Runnable {
    /** The process that generates the Charts. */
    private final StartedJobHistoryChartGen gen;

    /**
     * Constructor of a ChartGen object.
     *
     * @param gen the process that generates the charts.
     */
    ChartGen(StartedJobHistoryChartGen gen) {
      super();
      this.gen = gen;
    }

    @Override
    public void run() {

      synchronized (gen) {
        gen.chartFile = null;
      }

      long jobId = gen.jobId;

      StartedJobInfo[] fullHistory = RunningJobsInfoDAO.getInstance().getFullJobHistory(jobId);

      LinkedList<Double> timeValues = new LinkedList<Double>();
      LinkedList<Double> progressValues = new LinkedList<Double>();
      LinkedList<Double> urlValues = new LinkedList<Double>();

      for (StartedJobInfo sji : fullHistory) {
        timeValues.add((double) sji.getElapsedSeconds());
        progressValues.add(sji.getProgress());
        urlValues.add((double) sji.getQueuedFilesCount());
      }

      // Refresh the history png image for the job.
      File pngFile = new File(gen.outputFolder, jobId + "-history.png");

      File newPngFile;
      try {
        newPngFile =
            File.createTempFile(jobId + "-history", "." + System.currentTimeMillis() + ".png");
      } catch (IOException e) {
        LOG.warn("Failed to create temp PNG file for job " + jobId);
        return;
      }

      long startTime = System.currentTimeMillis();
      gen.generatePngChart(
          newPngFile,
          CHART_RESOLUTION[0],
          CHART_RESOLUTION[1],
          null, // no chart title
          I18N.getString(gen.locale, "running.job.details.chart.legend.crawlTime"),
          new String[] {
            I18N.getString(gen.locale, "running.job.details.chart.legend.progress"),
            I18N.getString(gen.locale, "running.job.details.chart.legend.queuedUris")
          },
          NumberUtils.toPrimitiveArray(timeValues),
          new double[][] {new double[] {0, 100}, null},
          new double[][] {
            NumberUtils.toPrimitiveArray(progressValues), NumberUtils.toPrimitiveArray(urlValues)
          },
          new Color[] {Color.blue, Color.green.darker()},
          new String[] {"%", ""},
          false,
          Color.lightGray.brighter().brighter());

      long genTime = System.currentTimeMillis() - startTime;
      LOG.info(
          "Generated history chart for job "
              + jobId
              + " in "
              + (genTime < TimeUtils.SECOND_IN_MILLIS
                  ? genTime + " ms"
                  : StringUtils.formatDuration(genTime / TimeUtils.SECOND_IN_MILLIS))
              + ".");

      synchronized (gen) {
        // Overwrite old file, then delete temp file
        try {
          FileUtils.copyFile(newPngFile, pngFile);
          FileUtils.remove(newPngFile);
        } catch (IOFailure iof) {
          LOG.error("IOFailure while copying PNG file", iof);
        }
        gen.chartFile = pngFile;
      }
    }
  }

  /** The class logger. */
  static final Log LOG = LogFactory.getLog(StartedJobHistoryChartGen.class);

  /** Internationalisation object. */
  private static final I18n I18N = new I18n(dk.netarkivet.harvester.Constants.TRANSLATIONS_BUNDLE);

  /** Rate in seconds at which history charts should be generated. */
  private static final long GEN_INTERVAL =
      Settings.getLong(HarvesterSettings.HARVEST_MONITOR_HISTORY_CHART_GEN_INTERVAL);

  /** The chart image resolution. */
  private static final int[] CHART_RESOLUTION = new int[] {600, 450};
  /** The dimension of the chart axis. */
  private static final double CHART_AXIS_DIMENSION = 10.0;
  /** The relative path of the output. */
  private static final String OUTPUT_REL_PATH = "History" + File.separator + "webapp";

  /** The job id. */
  private final long jobId;

  /** The folder where image files are output. */
  private final File outputFolder;

  /** The chart image file. */
  private File chartFile = null;

  /** The locale for internationalizing the chart. The locale is set to the system default. */
  private final Locale locale;
  /** The process controlling the cyclic regeneration of charts. */
  private PeriodicTaskExecutor genExec = null;

  /**
   * Constructor. Start generating charts for data belonging to the given job.
   *
   * @param jobId a job id.
   */
  StartedJobHistoryChartGen(long jobId) {
    super();

    this.outputFolder = new File(FileUtils.getTempDir() + File.separator + OUTPUT_REL_PATH);

    this.jobId = jobId;

    // Set the locale to the system default
    this.locale = Locale.getDefault();

    genExec = new PeriodicTaskExecutor("ChartGen", new ChartGen(this), 0, GEN_INTERVAL);
  }

  /**
   * Returns the image file.
   *
   * @return the image file. Might return null if no file is currently available.
   */
  public synchronized File getChartFile() {
    return chartFile;
  }

  /** Deletes the chart image if it exists and stops the generation schedule. */
  public void cleanup() {

    if (chartFile != null && chartFile.exists()) {
      if (!chartFile.delete()) {
        chartFile.deleteOnExit();
      }
    }

    if (genExec != null) {
      genExec.shutdown();
    }
  }

  /**
   * Generates a chart in PNG format.
   *
   * @param outputFile the output file; it should exist.
   * @param pxWidth the image width in pixels.
   * @param pxHeight the image height in pixels.
   * @param chartTitle the chart title, may be null.
   * @param xAxisTitle the x axis title
   * @param yDataSeriesTitles the Y axis titles.
   * @param timeValuesInSeconds the time values in seconds
   * @param yDataSeriesRange the axis range (null for auto)
   * @param yDataSeries the Y axis value series.
   * @param yDataSeriesColors the Y axis value series drawing colors.
   * @param yDataSeriesTickSuffix TODO explain argument yDataSeriesTickSuffix
   * @param drawBorder draw, or not, the border.
   * @param backgroundColor the chart background color.
   */
  final void generatePngChart(
      File outputFile,
      int pxWidth,
      int pxHeight,
      String chartTitle,
      String xAxisTitle,
      String[] yDataSeriesTitles,
      double[] timeValuesInSeconds,
      double[][] yDataSeriesRange,
      double[][] yDataSeries,
      Color[] yDataSeriesColors,
      String[] yDataSeriesTickSuffix,
      boolean drawBorder,
      Color backgroundColor) {

    // Domain axis
    NumberAxis xAxis = new NumberAxis(xAxisTitle);
    xAxis.setFixedDimension(CHART_AXIS_DIMENSION);
    xAxis.setLabelPaint(Color.black);
    xAxis.setTickLabelPaint(Color.black);

    double maxSeconds = getMaxValue(timeValuesInSeconds);
    TimeAxisResolution xAxisRes = TimeAxisResolution.findTimeUnit(maxSeconds);
    xAxis.setTickUnit(new NumberTickUnit(xAxisRes.tickStep));
    double[] scaledTimeValues = xAxisRes.scale(timeValuesInSeconds);

    String tickSymbol =
        I18N.getString(locale, "running.job.details.chart.timeunit.symbol." + xAxisRes.name());
    xAxis.setNumberFormatOverride(new DecimalFormat("###.##'" + tickSymbol + "'"));

    // First dataset
    String firstDataSetTitle = yDataSeriesTitles[0];
    XYDataset firstDataSet = createXYDataSet(firstDataSetTitle, scaledTimeValues, yDataSeries[0]);
    Color firstDataSetColor = yDataSeriesColors[0];

    // First range axis
    NumberAxis firstYAxis = new NumberAxis(firstDataSetTitle);

    firstYAxis.setFixedDimension(CHART_AXIS_DIMENSION);
    setAxisRange(firstYAxis, yDataSeriesRange[0]);
    firstYAxis.setLabelPaint(firstDataSetColor);
    firstYAxis.setTickLabelPaint(firstDataSetColor);
    String firstAxisTickSuffix = yDataSeriesTickSuffix[0];
    if (firstAxisTickSuffix != null && !firstAxisTickSuffix.isEmpty()) {
      firstYAxis.setNumberFormatOverride(new DecimalFormat("###.##'" + firstAxisTickSuffix + "'"));
    }

    // Create the plot with domain axis and first range axis
    XYPlot plot = new XYPlot(firstDataSet, xAxis, firstYAxis, null);

    XYLineAndShapeRenderer firstRenderer = new XYLineAndShapeRenderer(true, false);
    plot.setRenderer(firstRenderer);

    plot.setOrientation(PlotOrientation.VERTICAL);
    plot.setBackgroundPaint(Color.lightGray);
    plot.setDomainGridlinePaint(Color.white);
    plot.setRangeGridlinePaint(Color.white);

    plot.setAxisOffset(new RectangleInsets(5.0, 5.0, 5.0, 5.0));
    firstRenderer.setSeriesPaint(0, firstDataSetColor);

    // Now iterate on next axes
    for (int i = 1; i < yDataSeries.length; i++) {
      // Create axis
      String seriesTitle = yDataSeriesTitles[i];
      Color seriesColor = yDataSeriesColors[i];
      NumberAxis yAxis = new NumberAxis(seriesTitle);

      yAxis.setFixedDimension(CHART_AXIS_DIMENSION);
      setAxisRange(yAxis, yDataSeriesRange[i]);

      yAxis.setLabelPaint(seriesColor);
      yAxis.setTickLabelPaint(seriesColor);

      String yAxisTickSuffix = yDataSeriesTickSuffix[i];
      if (yAxisTickSuffix != null && !yAxisTickSuffix.isEmpty()) {
        yAxis.setNumberFormatOverride(new DecimalFormat("###.##'" + yAxisTickSuffix + "'"));
      }

      // Create dataset and add axis to plot
      plot.setRangeAxis(i, yAxis);
      plot.setRangeAxisLocation(i, AxisLocation.BOTTOM_OR_LEFT);
      plot.setDataset(i, createXYDataSet(seriesTitle, scaledTimeValues, yDataSeries[i]));
      plot.mapDatasetToRangeAxis(i, i);
      XYItemRenderer renderer = new StandardXYItemRenderer();
      renderer.setSeriesPaint(0, seriesColor);
      plot.setRenderer(i, renderer);
    }

    // Create the chart
    JFreeChart chart = new JFreeChart(chartTitle, JFreeChart.DEFAULT_TITLE_FONT, plot, false);

    // Customize rendering
    chart.setBackgroundPaint(Color.white);
    chart.setBorderVisible(true);
    chart.setBorderPaint(Color.BLACK);

    // Render image
    try {
      ChartUtilities.saveChartAsPNG(outputFile, chart, pxWidth, pxHeight);
    } catch (IOException e) {
      LOG.error("Chart export failed", e);
    }
  }

  /**
   * Create a XYDataset based on the given arguments.
   *
   * @param name The name
   * @param timeValues The timevalues
   * @param values the values
   * @return a DefaultXYDataset.
   */
  private XYDataset createXYDataSet(String name, double[] timeValues, double[] values) {

    DefaultXYDataset ds = new DefaultXYDataset();
    ds.addSeries(name, new double[][] {timeValues, values});

    return ds;
  }

  /**
   * Find the maximum of the given values. If no value is greater than {@link Double#MIN_VALUE},
   * {@link Double#MIN_VALUE} is returned.
   *
   * @param values an array of doubles
   * @return the maximum of the values given
   */
  private double getMaxValue(double[] values) {
    double max = Double.MIN_VALUE;
    for (double v : values) {
      max = Math.max(v, max);
    }
    return max;
  }

  /**
   * Set the axis range.
   *
   * @param axis a numberAxis
   * @param range a range
   */
  private void setAxisRange(NumberAxis axis, double[] range) {
    if (range == null || range.length != 2) {
      axis.setAutoRange(true);
    } else {
      double lower = range[0];
      double upper = range[1];
      ArgumentNotValid.checkTrue(lower < upper, "Incorrect range");
      axis.setAutoRange(false);
      axis.setRange(new Range(lower, upper));
    }
  }
}
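A minimal usage sketch for the generator above (the job id is an assumption, not from the original source):

// Sketch: start periodic chart generation for job 42, then poll for the image.
StartedJobHistoryChartGen chartGen = new StartedJobHistoryChartGen(42L);
File chart = chartGen.getChartFile(); // may be null until the first ChartGen run completes
if (chart != null && chart.exists()) {
  // serve the PNG from the webapp output folder ...
}
chartGen.cleanup(); // stop the periodic task and delete the image file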
Code example #15
  /**
   * Package private constructor for common initialisation.
   *
   * @param harvestID the id of the harvestdefinition
   * @param cfg the configuration to base the Job on
   * @param orderXMLdoc the Heritrix order template to use for this job
   * @param channel the channel on which the job will be submitted.
   * @param forceMaxObjectsPerDomain the maximum number of objects harvested from a domain,
   *     overrides individual configuration settings. -1 means no limit
   * @param forceMaxBytesPerDomain The maximum number of objects harvested from a domain, or -1 for
   *     no limit.
   * @param forceMaxJobRunningTime The max time in seconds given to the harvester for this job
   * @param harvestNum the run number of the harvest definition
   * @throws ArgumentNotValid if cfg, harvestID or channel is null, if harvestID is negative, or if
   *     any limit < -1
   */
  public Job(
      Long harvestID,
      DomainConfiguration cfg,
      HeritrixTemplate orderXMLdoc,
      HarvestChannel channel,
      long forceMaxObjectsPerDomain,
      long forceMaxBytesPerDomain,
      long forceMaxJobRunningTime,
      int harvestNum)
      throws ArgumentNotValid {
    ArgumentNotValid.checkNotNull(cfg, "cfg");
    ArgumentNotValid.checkNotNull(harvestID, "harvestID");
    ArgumentNotValid.checkNotNegative(harvestID, "harvestID");
    ArgumentNotValid.checkNotNull(channel, "channel");

    if (forceMaxObjectsPerDomain < -1) {
      String msg = "forceMaxObjectsPerDomain must be either -1 or positive";
      log.debug(msg);
      throw new ArgumentNotValid(msg);
    }
    if (forceMaxBytesPerDomain < -1) {
      String msg = "forceMaxBytesPerDomain must be either -1 or positive";
      log.debug(msg);
      throw new ArgumentNotValid(msg);
    }

    if (forceMaxBytesPerDomain == 0L) {
      log.warn(
          "forceMaxBytesPerDomain should probably not be 0.Means 0 bytes downloaded per domain");
    }

    if (forceMaxObjectsPerDomain == 0L) {
      log.warn(
          "forceMaxObjectsPerDomain should probably not be 0.Means 0 objects downloaded per domain");
    }

    // setup initial members
    domainConfigurationMap = new HashMap<>();
    origHarvestDefinitionID = harvestID;
    orderXMLname = cfg.getOrderXmlName();
    this.orderXMLdoc = orderXMLdoc;

    setHarvestChannel(channel);

    long maxObjects = NumberUtils.minInf(forceMaxObjectsPerDomain, cfg.getMaxObjects());
    setMaxObjectsPerDomain(maxObjects);
    configurationSetsObjectLimit = (maxObjects != forceMaxObjectsPerDomain);

    long maxBytes = NumberUtils.minInf(forceMaxBytesPerDomain, cfg.getMaxBytes());
    setMaxBytesPerDomain(maxBytes);
    configurationSetsByteLimit = (maxBytes != forceMaxBytesPerDomain);

    long expectation =
        cfg.getExpectedNumberOfObjects(forceMaxObjectsPerDomain, forceMaxBytesPerDomain);
    maxCountObjects = expectation;
    minCountObjects = expectation;
    this.harvestNum = harvestNum;

    addConfiguration(cfg);

    setMaxJobRunningTime(forceMaxJobRunningTime);

    setArchiveFormatInTemplate(Settings.get(HarvesterSettings.HERITRIX_ARCHIVE_FORMAT));

    setAttributes(cfg.getAttributesAndTypes());

    orderXMLdoc.enableOrDisableDeduplication(
        Settings.getBoolean(HarvesterSettings.DEDUPLICATION_ENABLED));

    status = JobStatus.NEW;
  }
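The constructor combines the harvest-level force limits with the configuration's own limits via NumberUtils.minInf. A worked illustration, assuming minInf returns the minimum of two longs while treating -1 as infinity (matching the "-1 means no limit" convention in the surrounding code):

// Assumed semantics of NumberUtils.minInf: minimum of two longs, -1 counting as infinity.
// minInf(-1, 5000)   -> 5000  (the configuration limit wins over "no limit")
// minInf(2000, 5000) -> 2000  (the smaller concrete limit wins)
// minInf(-1, -1)     -> -1    (both unlimited)
long maxObjects = NumberUtils.minInf(-1L, 5000L); // maxObjects == 5000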
Code example #16
/**
 * This class represents one job to run by Heritrix. It's based on a number of configurations all
 * based on the same order.xml and at most one configuration for each domain. Each job consists of
 * configurations of the approximate same size; that is the difference in expectation from the
 * smallest configuration to the largest configuration is within a factor of each other defined as
 * limMaxRelSize (although differences smaller than limMinAbsSize are ignored) There is a limit
 * limMaxTotalSize on the total size of the job in objects.
 *
 * <p>A job may also be limited on bytes or objects, defined either by the configurations in the job
 * or the harvest definition the job is generated by.
 *
 * <p>The job contains the order file, the seedlist and the current status of the job, as well as
 * the ID of the harvest definition that defined it and names of all the configurations it is based
 * on.
 */
@SuppressWarnings({"serial"})
public class Job implements Serializable, JobInfo {
  private static final transient Logger log = LoggerFactory.getLogger(Job.class);

  // Persistent fields stored in and read from DAO
  /** The persistent ID of this job. */
  private Long jobID;
  /** The Id of the harvestdefinition, that generated this job. */
  protected Long origHarvestDefinitionID;
  /** The status of the job. See the JobStatus class for the possible states. */
  protected JobStatus status;
  /** The name of the {@link HarvestChannel} on which this job will be posted. */
  private String channel;

  /** Whether the job belongs to a snapshot or partial harvest. */
  private boolean isSnapshot;
  /**
   * Overrides the individual configurations' maximum setting for objects retrieved from a domain
   * when set to a positive value.
   */
  private long forceMaxObjectsPerDomain = Constants.HERITRIX_MAXOBJECTS_INFINITY;
  /**
   * Overrides the individual configurations' maximum setting for bytes retrieved from a domain when
   * set to other than -1.
   */
  private long forceMaxBytesPerDomain = Constants.HERITRIX_MAXBYTES_INFINITY;
  /** The name of the harvest template used by the job. */
  private String orderXMLname;
  /** The harvest template used by the job. */
  private HeritrixTemplate orderXMLdoc;
  /** The list of Heritrix settings files. */
  private File[] settingsXMLfiles;

  /** The corresponding Dom4j Documents for these files. */
  // private Document[] settingsXMLdocs;

  /**
   * A set of seeds involved in this job. Outside the setSeedList() method, the set of seeds is
   * updated in the addConfiguration() method.
   */
  private Set<String> seedListSet = new HashSet<String>();
  /** Which run of the harvest definition this is. */
  private int harvestNum;
  /** Errors during harvesting. */
  private String harvestErrors;
  /** Details about errors during harvesting. */
  private String harvestErrorDetails;
  /** Errors during upload of the harvested data. */
  private String uploadErrors;
  /** Details about errors during upload of the harvested data. */
  private String uploadErrorDetails;
  /** The starting point of the job. */
  private Date actualStart;
  /** The ending point of the job. */
  private Date actualStop;
  /** The time when this job was submitted. */
  private Date submittedDate;
  /** The time when this job was created. */
  private Date creationDate;

  /** Edition is used by the DAO to keep track of changes. */
  private long edition = -1;

  /** Resubmitted as the Job with this ID. If null, this job has not been resubmitted. */
  private Long resubmittedAsJobWithID;

  /** Continuation of this job. */
  private Long continuationOF;

  /**
   * A map (domainName, domainConfigurationName); it must be accessible in order to update job
   * information (see Ass. 2.4.3).
   */
  private Map<String, String> domainConfigurationMap;
  /**
   * A hint to the DAO that configurations have changed. Since configurations are large, the DAO can
   * use that this is false to avoid updating the config list. The DAO can set it to false after
   * saving configurations.
   */
  boolean configsChanged = false;

  // Intermediate fields, non-persistent and only used while building objects

  /**
   * Whether the maxObjects field was defined by the harvest definition or the configuration limit.
   * This is deciding for whether we accept smaller configurations or not when building jobs. True
   * means the limit is defined by the configuration, false means that it is defined by the harvest
   * definition.
   */
  private boolean configurationSetsObjectLimit;

  /**
   * Whether the maxBytes field was defined by the harvest definition or the configuration limit.
   * This is deciding for whether we accept smaller configurations or not when building jobs. True
   * means the limit is defined by the configuration, false means by the harvest definition.
   */
  private boolean configurationSetsByteLimit;

  /** The lowest number of objects expected by a configuration. */
  private long minCountObjects;

  /** The highest number of objects expected by a configuration. */
  private long maxCountObjects;

  /** The total number of objects expected by all added configurations. */
  private long totalCountObjects;

  /** The max time in seconds given to the harvester for this job. 0 is unlimited. */
  private long forceMaxRunningTime;

  /**
   * If true, this job object is still undergoing changes due to having more configurations added.
   * When set to false, the object is considered immutable except for updates to the status.
   *
   * <p>Jobs loaded from the DAO are never under construction anymore.
   */
  private boolean underConstruction = true;

  // Constants

  // Note: The following constants are intentionally left non-static for easy
  // unit testing

  private boolean maxObjectsIsSetByQuotaEnforcer =
      Settings.getBoolean(HarvesterSettings.OBJECT_LIMIT_SET_BY_QUOTA_ENFORCER);

  /**
   * The harvestname prefix used in the files generated by Heritrix. Is set using an
   * ArchiveFileNaming class when the jobID is available.
   */
  private String harvestnamePrefix;

  /** This variable is right now the same as harvestdefinitions.audience field. */
  private String harvestAudience;

  protected Job() {
    this.status = JobStatus.NEW;
  }

  /**
   * Package private constructor for common initialisation.
   *
   * @param harvestID the id of the harvestdefinition
   * @param cfg the configuration to base the Job on
   * @param orderXMLdoc the Heritrix order template to use for this job
   * @param channel the channel on which the job will be submitted.
   * @param forceMaxObjectsPerDomain the maximum number of objects harvested from a domain,
   *     overrides individual configuration settings. -1 means no limit
   * @param forceMaxBytesPerDomain The maximum number of objects harvested from a domain, or -1 for
   *     no limit.
   * @param forceMaxJobRunningTime The max time in seconds given to the harvester for this job
   * @param harvestNum the run number of the harvest definition
   * @throws ArgumentNotValid if cfg, harvestID or channel is null, if harvestID is negative, or if
   *     any limit < -1
   */
  public Job(
      Long harvestID,
      DomainConfiguration cfg,
      HeritrixTemplate orderXMLdoc,
      HarvestChannel channel,
      long forceMaxObjectsPerDomain,
      long forceMaxBytesPerDomain,
      long forceMaxJobRunningTime,
      int harvestNum)
      throws ArgumentNotValid {
    ArgumentNotValid.checkNotNull(cfg, "cfg");
    ArgumentNotValid.checkNotNull(harvestID, "harvestID");
    ArgumentNotValid.checkNotNegative(harvestID, "harvestID");
    ArgumentNotValid.checkNotNull(channel, "channel");

    if (forceMaxObjectsPerDomain < -1) {
      String msg = "forceMaxObjectsPerDomain must be either -1 or positive";
      log.debug(msg);
      throw new ArgumentNotValid(msg);
    }
    if (forceMaxBytesPerDomain < -1) {
      String msg = "forceMaxBytesPerDomain must be either -1 or positive";
      log.debug(msg);
      throw new ArgumentNotValid(msg);
    }

    if (forceMaxBytesPerDomain == 0L) {
      log.warn(
          "forceMaxBytesPerDomain should probably not be 0.Means 0 bytes downloaded per domain");
    }

    if (forceMaxObjectsPerDomain == 0L) {
      log.warn(
          "forceMaxObjectsPerDomain should probably not be 0.Means 0 objects downloaded per domain");
    }

    // setup initial members
    domainConfigurationMap = new HashMap<>();
    origHarvestDefinitionID = harvestID;
    orderXMLname = cfg.getOrderXmlName();
    this.orderXMLdoc = orderXMLdoc;

    setHarvestChannel(channel);

    long maxObjects = NumberUtils.minInf(forceMaxObjectsPerDomain, cfg.getMaxObjects());
    setMaxObjectsPerDomain(maxObjects);
    configurationSetsObjectLimit = (maxObjects != forceMaxObjectsPerDomain);

    long maxBytes = NumberUtils.minInf(forceMaxBytesPerDomain, cfg.getMaxBytes());
    setMaxBytesPerDomain(maxBytes);
    configurationSetsByteLimit = (maxBytes != forceMaxBytesPerDomain);

    long expectation =
        cfg.getExpectedNumberOfObjects(forceMaxObjectsPerDomain, forceMaxBytesPerDomain);
    maxCountObjects = expectation;
    minCountObjects = expectation;
    this.harvestNum = harvestNum;

    addConfiguration(cfg);

    setMaxJobRunningTime(forceMaxJobRunningTime);

    setArchiveFormatInTemplate(Settings.get(HarvesterSettings.HERITRIX_ARCHIVE_FORMAT));

    setAttributes(cfg.getAttributesAndTypes());

    orderXMLdoc.enableOrDisableDeduplication(
        Settings.getBoolean(HarvesterSettings.DEDUPLICATION_ENABLED));

    status = JobStatus.NEW;
  }

  public void setAttributes(List<AttributeAndType> attributesAndTypes) {
    orderXMLdoc.insertAttributes(attributesAndTypes);
  }

  /** Update the order template according to the chosen archive format (arc/warc). */
  private void setArchiveFormatInTemplate(String archiveFormat) {
    if (!underConstruction) {
      final String msg = "Cannot modify job " + this + " as it is no longer under construction";
      log.debug(msg);
      throw new IllegalState(msg);
    }
    orderXMLdoc.setArchiveFormat(archiveFormat);
  }

  /**
   * Create a new Job object from basic information stored in the DAO.
   *
   * @param harvestID the id of the harvestdefinition
   * @param configurations the configurations to base the Job on
   * @param channel the name of the channel on which the job will be submitted.
   * @param snapshot whether the job belongs to a snapshot harvest
   * @param forceMaxObjectsPerDomain the maximum number of objects harvested from a domain,
   *     overrides individual configuration settings. 0 means no limit.
   * @param forceMaxBytesPerDomain The maximum number of objects harvested from a domain, or -1 for
   *     no limit.
   * @param forceMaxJobRunningTime The max time in seconds given to the harvester for this job
   * @param status the current status of the job.
   * @param orderXMLname the name of the order template used.
   * @param orderXMLdoc the (possibly modified) template
   * @param seedlist the combined seedlist from all configs.
   * @param harvestNum the run number of the harvest definition
   */
  Job(
      Long harvestID,
      Map<String, String> configurations,
      String channel,
      boolean snapshot,
      long forceMaxObjectsPerDomain,
      long forceMaxBytesPerDomain,
      long forceMaxJobRunningTime,
      JobStatus status,
      String orderXMLname,
      HeritrixTemplate orderXMLdoc,
      String seedlist,
      int harvestNum,
      Long continuationOf) {
    origHarvestDefinitionID = harvestID;
    domainConfigurationMap = configurations;
    this.channel = channel;
    this.isSnapshot = snapshot;
    this.forceMaxBytesPerDomain = forceMaxBytesPerDomain;
    this.forceMaxObjectsPerDomain = forceMaxObjectsPerDomain;
    this.forceMaxRunningTime = forceMaxJobRunningTime;
    this.status = status;
    this.orderXMLname = orderXMLname;
    this.orderXMLdoc = orderXMLdoc;
    this.setSeedList(seedlist);
    this.harvestNum = harvestNum;
    this.continuationOF = continuationOf;

    underConstruction = false;
  }

  /**
   * Adds a configuration to this Job. Seedlists and settings are updated accordingly.
   *
   * @param cfg the configuration to add
   * @throws ArgumentNotValid if cfg is null or cfg uses a different orderxml than this job or if
   *     this job already contains a configuration associated with domain of configuration cfg.
   */
  public void addConfiguration(DomainConfiguration cfg) {
    ArgumentNotValid.checkNotNull(cfg, "cfg");
    if (domainConfigurationMap.containsKey(cfg.getDomainName())) {
      throw new ArgumentNotValid(
          "Job already has a configuration for Domain " + cfg.getDomainName());
    }

    if (log.isTraceEnabled()) {
      log.trace("Adding configuration '{}' to job '{}'", cfg, cfg.getName());
    }

    if (!underConstruction) {
      final String msg = "Cannot modify job " + this + " as it is no longer under construction";
      log.debug(msg);
      throw new IllegalState(msg);
    }

    if (!cfg.getOrderXmlName().equals(getOrderXMLName())) {
      throw new ArgumentNotValid(
          "Job requires the orderxml file:'"
              + getOrderXMLName()
              + "' not:'"
              + cfg.getOrderXmlName()
              + "' used by the configuration:'"
              + cfg.getName());
    }

    domainConfigurationMap.put(cfg.getDomainName(), cfg.getName());

    // Add the seeds from the configuration to the Job seeds.
    // Take care of duplicates.
    for (Iterator<SeedList> itt = cfg.getSeedLists(); itt.hasNext(); ) {
      SeedList seed = itt.next();
      List<String> seeds = seed.getSeeds();
      for (String seedUrl : seeds) {
        seedListSet.add(seedUrl); // duplicates are silently ignored

        // TODO remove when Heritrix implements this functionality
        // try to convert a seed into an Internationalized Domain Name
        try {
          String seedASCII = seedUrl;
          // It is rare to see these seeds, but they need to be
          // correctly idnaized
          if (seedUrl.contains(":") || seedUrl.contains("/")) {
            String normalizedUrl = seedUrl;
            if (!normalizedUrl.matches("^[a-zA-Z]+:.*")) {
              // If no protocol is given, assume http
              normalizedUrl = "http://" + normalizedUrl;
            }
            URL url = new URL(normalizedUrl);
            String domainName = url.getHost();
            String domainNameASCII = IDNA.toASCII(domainName);
            if (!domainName.equals(domainNameASCII)) {
              // If the domain name changed, replace that in the
              // seed.
              seedASCII = seedUrl.replaceFirst(Pattern.quote(domainName), domainNameASCII);
            }
          } else {
            seedASCII = IDNA.toASCII(seedUrl);
          }
          if (!seedASCII.equals(seedUrl)) {
            log.trace("Converted {} to {}", seedUrl, seedASCII);
            // Note that duplicates are silently ignored
            seedListSet.add(seedASCII);
          }
        } catch (IDNAException e) {
          log.trace("Cannot convert seed {} to ASCII", seedUrl, e);
        } catch (MalformedURLException e) {
          log.trace("Cannot convert seed {} to ASCII", seedUrl, e);
        }
      }
    }

    orderXMLdoc.editOrderXMLAddPerDomainCrawlerTraps(cfg);

    // TODO update limits in settings files - see also bug 269

    // Update estimates of job size
    long expectation =
        cfg.getExpectedNumberOfObjects(forceMaxObjectsPerDomain, forceMaxBytesPerDomain);
    maxCountObjects = Math.max(expectation, maxCountObjects);
    minCountObjects = Math.min(expectation, minCountObjects);
    totalCountObjects += expectation;

    configsChanged = true;

    assert (maxCountObjects >= minCountObjects) : "basic invariant";
  }
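
  // Illustrative sketch of the IDN handling in addConfiguration (the seed value is
  // hypothetical; "bücher" is the classic punycode example):
  //   seedUrl = "http://bücher.example/katalog"
  //   IDNA.toASCII("bücher.example") -> "xn--bcher-kva.example"
  //   => both "http://bücher.example/katalog" and
  //      "http://xn--bcher-kva.example/katalog" end up in seedListSet.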

  /**
   * Get the name of the order XML file used by this Job.
   *
   * @return the name of the orderXML file
   */
  public String getOrderXMLName() {
    return orderXMLname;
  }

  /**
   * Get the actual time when this job was stopped/completed.
   *
   * @return the time as Date
   */
  public Date getActualStop() {
    return actualStop;
  }

  /**
   * Get the actual time when this job was started.
   *
   * @return the time as Date
   */
  public Date getActualStart() {
    return actualStart;
  }

  /**
   * Get the time when this job was submitted.
   *
   * @return the time as Date
   */
  public Date getSubmittedDate() {
    return submittedDate;
  }

  /**
   * Get the time when this job was created.
   *
   * @return the creation time as a <code>Date</code>
   */
  public Date getCreationDate() {
    return creationDate;
  }

  /**
   * Get a list of Heritrix settings.xml files. Note that these files have nothing to do with
   * NetarchiveSuite settings files. They are files that supplement the Heritrix order.xml files,
   * and contain overrides for specific domains.
   *
   * @return the list of Files as an array
   */
  public File[] getSettingsXMLfiles() {
    return settingsXMLfiles;
  }

  /**
   * Get the id of the HarvestDefinition from which this job originates.
   *
   * @return the id as a Long
   */
  public Long getOrigHarvestDefinitionID() {
    return origHarvestDefinitionID;
  }

  /**
   * Get the id of this Job.
   *
   * @return the id as a Long
   */
  public Long getJobID() {
    return jobID;
  }

  /**
   * Set the id of this Job.
   *
   * @param id The Id for this job.
   */
  public void setJobID(Long id) {
    jobID = id;
  }

  /**
   * Gets the total number of different domains harvested by this job.
   *
   * @return the number of domain configurations added to this job
   */
  public int getCountDomains() {
    return domainConfigurationMap.size();
  }

  /**
   * Set the actual time when this job was started.
   *
   * <p>Logs a warning if actualStart is set to a time after actualStop.
   *
   * @param actualStart A Date object representing the time when this job was started.
   */
  public void setActualStart(Date actualStart) {
    ArgumentNotValid.checkNotNull(actualStart, "actualStart");
    if (actualStop != null && actualStop.before(actualStart)) {
      log.warn(
          "Job("
              + getJobID()
              + "): Start time ("
              + actualStart
              + ") is after end time: "
              + actualStop);
    }
    this.actualStart = (Date) actualStart.clone();
  }

  /**
   * Set the actual time when this job was stopped/completed. Logs a warning if actualStop is set
   * to a time before actualStart.
   *
   * @param actualStop A Date object representing the time when this job was stopped.
   * @throws ArgumentNotValid if actualStop is null
   */
  public void setActualStop(Date actualStop) throws ArgumentNotValid {
    ArgumentNotValid.checkNotNull(actualStop, "actualStop");
    if (actualStart == null) {
      log.warn("Job(" + getJobID() + "): actualStart should be defined before setting actualStop");
    } else if (actualStop.before(actualStart)) {
      log.warn(
          "Job("
              + getJobID()
              + "): actualStop ("
              + actualStop
              + ") is before actualStart: "
              + actualStart);
    }
    this.actualStop = (Date) actualStop.clone();
  }

  /**
   * Set the orderxml for this job.
   *
   * @param doc An orderxml to be used by this job
   */
  public void setOrderXMLDoc(HeritrixTemplate doc) {
    ArgumentNotValid.checkNotNull(doc, "doc");
    this.orderXMLdoc = doc;
  }

  /**
   * Gets a template representation of the order.xml associated with this Job.
   *
   * @return the template as a HeritrixTemplate
   */
  public HeritrixTemplate getOrderXMLdoc() {
    return orderXMLdoc;
  }

  //    /**
  //     * Gets a list of document representations of the settings.xml's associated with this Job.
  //     *
  //     * @return the XML as an array of org.dom4j.Document
  //     */
  //    public Document[] getSettingsXMLdocs() {
  //        return settingsXMLdocs;
  //    }

  /**
   * Set the seedlist of the job from the seedList argument. Individual seeds are separated by a
   * '\n' character. Duplicate seeds are removed.
   *
   * @param seedList List of seeds as one String
   */
  public void setSeedList(String seedList) {
    ArgumentNotValid.checkNotNullOrEmpty(seedList, "seedList");
    seedListSet = new HashSet<>();
    BufferedReader reader = new BufferedReader(new StringReader(seedList));
    String seed;
    try {
      while ((seed = reader.readLine()) != null) {
        seedListSet.add(seed); // add to seedlist if not already there
      }
    } catch (IOException e) {
      // This never happens, as we're reading from a string!
      throw new IOFailure("IOException reading from seed string", e);
    } finally {
      IOUtils.closeQuietly(reader);
    }
  }
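
  // Usage sketch (hypothetical values): seeds are split on '\n' and stored in a
  // Set, so duplicates collapse.
  //   job.setSeedList("http://example.com\nhttp://example.org\nhttp://example.com");
  //   // seedListSet now holds exactly two seeds.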

  /**
   * Get the seedlist as a String. The individual seeds are separated by the character '\n'. The
   * order of the seeds is unknown.
   *
   * @return the seedlist as a String
   */
  public String getSeedListAsString() {
    return StringUtils.conjoin("\n", seedListSet);
  }

  /**
   * Get the current status of this Job.
   *
   * @return the current status as a JobStatus value
   */
  public JobStatus getStatus() {
    return status;
  }

  /**
   * Sets status of this job.
   *
   * @param newStatus the new JobStatus; the transition from the current status must be legal
   * @throws ArgumentNotValid in case of invalid status argument or invalid status change
   */
  public void setStatus(JobStatus newStatus) {
    ArgumentNotValid.checkNotNull(newStatus, "newStatus");
    if (!status.legalChange(newStatus)) {
      final String message =
          "Status change from " + status + " to " + newStatus + " is not allowed";
      log.debug(message);
      throw new ArgumentNotValid(message);
    }

    if ((this.status == JobStatus.NEW || this.status == JobStatus.RESUBMITTED)
        && newStatus == JobStatus.SUBMITTED) {
      orderXMLdoc.configureQuotaEnforcer(
          maxObjectsIsSetByQuotaEnforcer, forceMaxBytesPerDomain, forceMaxObjectsPerDomain);
    }

    if (this.status == JobStatus.SUBMITTED && newStatus == JobStatus.STARTED) {
      setActualStart(new Date());
    }
    if (this.status == JobStatus.STARTED
        && (newStatus == JobStatus.DONE || newStatus == JobStatus.FAILED)) {
      setActualStop(new Date());
    }
    status = newStatus;
  }
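
  // Illustrative sketch of the lifecycle handled by setStatus above (the job
  // variable is hypothetical, and each transition must be permitted by
  // JobStatus.legalChange):
  //   job.setStatus(JobStatus.SUBMITTED); // quota enforcer configured from the limits
  //   job.setStatus(JobStatus.STARTED);   // actualStart stamped with the current time
  //   job.setStatus(JobStatus.DONE);      // actualStop stamped with the current time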

  /**
   * Returns a map of domain names and name of their corresponding configuration.
   *
   * <p>The returned Map cannot be changed.
   *
   * @return a read-only Map from domain name (String) to configuration name (String)
   */
  public Map<String, String> getDomainConfigurationMap() {
    return Collections.unmodifiableMap(domainConfigurationMap);
  }
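
  // Sketch (illustration only): the returned map is wrapped in
  // Collections.unmodifiableMap, so writes fail.
  //   job.getDomainConfigurationMap().put("example.com", "default");
  //   // -> throws UnsupportedOperationException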

  /**
   * Gets the maximum number of objects harvested per domain.
   *
   * @return The maximum number of objects harvested per domain. 0 means no limit.
   */
  public long getMaxObjectsPerDomain() {
    return forceMaxObjectsPerDomain;
  }

  /**
   * Gets the maximum number of bytes harvested per domain.
   *
   * @return The maximum number of bytes harvested per domain. -1 means no limit.
   */
  public long getMaxBytesPerDomain() {
    return forceMaxBytesPerDomain;
  }

  /**
   * Get the edition number.
   *
   * @return The edition number
   */
  long getEdition() {
    return edition;
  }

  /**
   * Set the edition number.
   *
   * @param edition the new edition number
   */
  void setEdition(long edition) {
    this.edition = edition;
  }

  public void setHarvestChannel(HarvestChannel harvestChannel) {
    this.channel = harvestChannel.getName();
    this.isSnapshot = harvestChannel.isSnapshot();
  }

  /** @return the associated {@link HarvestChannel} name. */
  public String getChannel() {
    return channel;
  }

  /**
   * Sets the associated {@link HarvestChannel} name.
   *
   * @param channel the channel name
   */
  public void setChannel(String channel) {
    this.channel = channel;
  }

  /**
   * @return true if the job belongs to a snapshot harvest, false if it belongs to a focused
   *     harvest.
   */
  public boolean isSnapshot() {
    return isSnapshot;
  }

  /**
   * Sets whether job belongs to a snapshot or focused harvest.
   *
   * @param isSnapshot true if the job belongs to a snapshot harvest, false if it belongs to a
   *     focused harvest.
   */
  public void setSnapshot(boolean isSnapshot) {
    this.isSnapshot = isSnapshot;
  }

  @Override
  public String toString() {
    return "Job "
        + getJobID()
        + " (state = "
        + getStatus()
        + ", HD = "
        + getOrigHarvestDefinitionID()
        + ", channel = "
        + getChannel()
        + ", snapshot = "
        + isSnapshot()
        + ", forcemaxcount = "
        + getForceMaxObjectsPerDomain()
        + ", forcemaxbytes = "
        + getMaxBytesPerDomain()
        + ", forcemaxrunningtime = "
        + forceMaxRunningTime
        + ", orderxml = "
        + getOrderXMLName()
        + ", numconfigs = "
        + getDomainConfigurationMap().size()
        + ", created = "
        + getCreationDate()
        + (getSubmittedDate() != null ? ", submitted = " + getSubmittedDate() : "")
        + (getActualStart() != null ? ", started = " + getActualStart() : "")
        + (getActualStop() != null ? ", stopped = " + getActualStop() : "")
        + ")";
  }

  /** @return the forceMaxObjectsPerDomain. 0 means no limit. */
  public long getForceMaxObjectsPerDomain() {
    return forceMaxObjectsPerDomain;
  }

  /**
   * Sets the maxObjectsPerDomain value.
   *
   * @param maxObjectsPerDomain The forceMaxObjectsPerDomain to set. 0 means no limit.
   * @throws IOFailure Thrown from the template's setMaxObjectsPerDomain method.
   */
  protected void setMaxObjectsPerDomain(long maxObjectsPerDomain) {
    if (!underConstruction) {
      final String msg = "Cannot modify job " + this + " as it is no longer under construction";
      log.debug(msg);
      throw new IllegalState(msg);
    }

    this.forceMaxObjectsPerDomain = maxObjectsPerDomain;
    // FIXME? add a maxObjectsIsSetByQuotaEnforcer argument to setMaxObjectsPerDomain
    orderXMLdoc.setMaxObjectsPerDomain(maxObjectsPerDomain);
    // orderXMLdoc.editOrderXML_maxObjectsPerDomain(orderXMLdoc, maxObjectsPerDomain,
    //        maxObjectsIsSetByQuotaEnforcer);

    if (0L == maxObjectsPerDomain && 0L != forceMaxBytesPerDomain) {
      setMaxBytesPerDomain(0L);
    }
  }

  /**
   * Set the maxbytes per domain value.
   *
   * @param maxBytesPerDomain The maxBytesPerDomain to set, or -1 for no limit.
   */
  protected void setMaxBytesPerDomain(long maxBytesPerDomain) {
    if (!underConstruction) {
      final String msg = "Cannot modify job " + this + " as it is no longer under construction";
      log.debug(msg);
      throw new IllegalState(msg);
    }
    this.forceMaxBytesPerDomain = maxBytesPerDomain;
    orderXMLdoc.setMaxBytesPerDomain(maxBytesPerDomain);

    if (0L == maxBytesPerDomain && 0L != forceMaxObjectsPerDomain) {
      setMaxObjectsPerDomain(0L);
    }
  }
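
  // Note on the coupling between the two setters above: forcing either limit to 0
  // forces the other to 0 as well, e.g. (hypothetical call):
  //   job.setMaxObjectsPerDomain(0L); // also ends up calling setMaxBytesPerDomain(0L)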

  /**
   * Set the maxJobRunningTime value.
   *
   * @param maxJobRunningTime The maxJobRunningTime in seconds to set, or 0 for no limit.
   */
  protected void setMaxJobRunningTime(long maxJobRunningTime) {
    if (!underConstruction) {
      final String msg = "Cannot modify job " + this + " as it is no longer under construction";
      log.debug(msg);
      throw new IllegalState(msg);
    }
    this.forceMaxRunningTime = maxJobRunningTime;
    orderXMLdoc.setMaxJobRunningTime(maxJobRunningTime);
  }

  /** @return the MaxJobRunningTime. 0 means no limit. */
  public long getMaxJobRunningTime() {
    return forceMaxRunningTime;
  }

  /**
   * Get the harvestNum for this job. The number reflects which run of the harvest definition this
   * is.
   *
   * @return the harvestNum for this job.
   */
  public int getHarvestNum() {
    return harvestNum;
  }

  /**
   * Set the harvestNum for this job. The number reflects which run of the harvest definition this
   * is. ONLY TO BE USED IN THE CONSTRUCTION PHASE.
   *
   * @param harvestNum a given harvestNum
   */
  public void setHarvestNum(int harvestNum) {
    if (!underConstruction) {
      final String msg = "Cannot modify job " + this + " as it is no longer under construction";
      log.debug(msg);
      throw new IllegalState(msg);
    }
    this.harvestNum = harvestNum;
  }

  /**
   * Get the list of harvest errors for this job. If there are no harvest errors, null is
   * returned. This value is not meaningful until the job is finished (FAILED, DONE, RESUBMITTED).
   *
   * @return the harvest errors for this job or null if no harvest errors.
   */
  public String getHarvestErrors() {
    return harvestErrors;
  }

  /**
   * Append to the list of harvest errors for this job. Nothing happens if the argument
   * harvestErrors is null.
   *
   * @param harvestErrors a string containing harvest errors (may be null)
   */
  public void appendHarvestErrors(String harvestErrors) {
    if (harvestErrors != null) {
      if (this.harvestErrors == null) {
        this.harvestErrors = harvestErrors;
      } else {
        this.harvestErrors += "\n" + harvestErrors;
      }
    }
  }

  /**
   * Get the list of harvest error details for this job. If there are no harvest error details,
   * null is returned. This value is not meaningful until the job is finished (FAILED, DONE,
   * RESUBMITTED).
   *
   * @return the list of harvest error details for this job or null if no harvest error details.
   */
  public String getHarvestErrorDetails() {
    return harvestErrorDetails;
  }

  /**
   * Append to the list of harvest error details for this job. Nothing happens if the argument
   * harvestErrorDetails is null.
   *
   * @param harvestErrorDetails a string containing harvest error details.
   */
  public void appendHarvestErrorDetails(String harvestErrorDetails) {
    if (harvestErrorDetails != null) {
      if (this.harvestErrorDetails == null) {
        this.harvestErrorDetails = harvestErrorDetails;
      } else {
        this.harvestErrorDetails += "\n" + harvestErrorDetails;
      }
    }
  }

  /**
   * Get the list of upload errors. If there are no upload errors, null is returned. This value
   * is not meaningful until the job is finished (FAILED, DONE, RESUBMITTED).
   *
   * @return the list of upload errors as String, or null if no upload errors.
   */
  public String getUploadErrors() {
    return uploadErrors;
  }

  /**
   * Append to the list of upload errors. Nothing happens if the argument uploadErrors is null.
   *
   * @param uploadErrors a string containing upload errors.
   */
  public void appendUploadErrors(String uploadErrors) {
    if (uploadErrors != null) {
      if (this.uploadErrors == null) {
        this.uploadErrors = uploadErrors;
      } else {
        this.uploadErrors += "\n" + uploadErrors;
      }
    }
  }

  /**
   * Get the list of upload error details. If there are no upload error details, null is
   * returned. This value is not meaningful until the job is finished (FAILED, DONE, RESUBMITTED).
   *
   * @return the list of upload error details as String, or null if no upload error details
   */
  public String getUploadErrorDetails() {
    return uploadErrorDetails;
  }

  /**
   * Append to the list of upload error details. Nothing happens if the argument
   * uploadErrorDetails is null.
   *
   * @param uploadErrorDetails a string containing upload error details.
   */
  public void appendUploadErrorDetails(String uploadErrorDetails) {
    if (uploadErrorDetails != null) {
      if (this.uploadErrorDetails == null) {
        this.uploadErrorDetails = uploadErrorDetails;
      } else {
        this.uploadErrorDetails += "\n" + uploadErrorDetails;
      }
    }
  }

  /**
   * Get the ID for the job which this job was resubmitted as. If null, this job has not been
   * resubmitted.
   *
   * @return this ID.
   */
  public Long getResubmittedAsJob() {
    return resubmittedAsJobWithID;
  }

  /**
   * Set the Date for when this job was submitted. If null, this job has not been submitted.
   *
   * @param submittedDate The date when this was submitted
   */
  public void setSubmittedDate(Date submittedDate) {
    this.submittedDate = submittedDate;
  }

  /**
   * Set the Date for when this job was created. If null, this job has not been created.
   *
   * @param creationDate The date when this was created
   */
  public void setCreationDate(Date creationDate) {
    this.creationDate = creationDate;
  }

  /**
   * Set the ID for the job which this job was resubmitted as.
   *
   * @param resubmittedAsJob An Id for a new job.
   */
  public void setResubmittedAsJob(Long resubmittedAsJob) {
    this.resubmittedAsJobWithID = resubmittedAsJob;
  }

  /**
   * @return id of the job that this job is supposed to continue using Heritrix recover-log or null
   *     if it starts from scratch.
   */
  public Long getContinuationOf() {
    return this.continuationOF;
  }

  @Override
  public String getHarvestFilenamePrefix() {
    if (this.harvestnamePrefix == null) {
      log.warn(
          "HarvestnamePrefix not yet set for job {}. Set it by using the naming scheme. "
              + "This should only happen for old jobs being read",
          this.jobID);
      setDefaultHarvestNamePrefix();
    }
    return this.harvestnamePrefix;
  }

  /** @param prefix the harvest filename prefix to use for this job. */
  public void setHarvestFilenamePrefix(String prefix) {
    this.harvestnamePrefix = prefix;
  }

  /** @return the forceMaxBytesPerDomain */
  public long getForceMaxBytesPerDomain() {
    return forceMaxBytesPerDomain;
  }

  /** @return the configurationSetsObjectLimit */
  public boolean isConfigurationSetsObjectLimit() {
    return configurationSetsObjectLimit;
  }

  /** @return the configurationSetsByteLimit */
  public boolean isConfigurationSetsByteLimit() {
    return configurationSetsByteLimit;
  }

  /** @return the minCountObjects */
  public long getMinCountObjects() {
    return minCountObjects;
  }

  /** @return the maxCountObjects */
  public long getMaxCountObjects() {
    return maxCountObjects;
  }

  /** @return the totalCountObjects */
  public long getTotalCountObjects() {
    return totalCountObjects;
  }

  void setDefaultHarvestNamePrefix() {
    if (getJobID() != null) {
      ArchiveFileNaming naming = ArchiveFileNamingFactory.getInstance();
      log.debug("Applying the default ArchiveFileNaming class '{}'.", naming.getClass().getName());
      final String prefix = naming.getPrefix(this);
      setHarvestFilenamePrefix(prefix);
      log.debug("The harvestPrefix of this job is: {}", prefix);
    } else {
      log.warn(
          "The harvestnamePrefix is not set now, as it depends on the JobID, which is not set yet");
    }
  }
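
  // Sketch (an assumption about the configured naming class): for a job with
  // jobID 1234, a typical prefix produced by naming.getPrefix(this) might look
  // like "1234-..."; the exact form depends on the ArchiveFileNaming implementation.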

  /** @return the harvest-audience. */
  public String getHarvestAudience() {
    return harvestAudience;
  }

  /**
   * Set the harvest audience for this job. Taken from the harvestdefinition that generated this
   * job.
   *
   * @param theAudience the harvest-audience.
   */
  public void setHarvestAudience(String theAudience) {
    this.harvestAudience = theAudience;
  }

  // The following two methods are needed by harvestStatus-jobdetails.jsp.
  /**
   * Returns a list of seeds for this job, grouped by domain. Within each domain the seeds are
   * sorted by URL; the ordering of the domains themselves is unspecified.
   *
   * @return a list of sorted seeds for this job.
   */
  public List<String> getSortedSeedList() {
    Map<String, Set<String>> urlMap = new HashMap<String, Set<String>>();
    for (String seed : seedListSet) {
      String url;
      // Assume the protocol is http://, if it is missing
      if (!seed.matches(Constants.PROTOCOL_REGEXP)) {
        url = "http://" + seed;
      } else {
        url = seed;
      }
      String domain = getDomain(url);
      if (domain == null) {
        // stop processing this url, and continue to the next seed
        continue;
      }
      Set<String> set;
      if (urlMap.containsKey(domain)) {
        set = urlMap.get(domain);
      } else {
        set = new TreeSet<String>();
        urlMap.put(domain, set);
      }
      set.add(seed);
    }
    List<String> result = new ArrayList<String>();
    for (Set<String> set : urlMap.values()) {
      result.addAll(set);
    }
    return result;
  }
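
  // Illustrative example (hypothetical seeds): given
  //   {"http://b.example.com/x", "http://a.example.com/y", "http://example.org/z"}
  // the two example.com seeds come back adjacent and sorted within their domain;
  // the relative order of example.com and example.org is unspecified (HashMap).
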
  /**
   * Get the domain, that the given URL belongs to.
   *
   * @param url a URL
   * @return the domain, that the given URL belongs to, or null if unable to do so.
   */
  private String getDomain(String url) {
    try {
      URL uri = new URL(url);
      return DomainUtils.domainNameFromHostname(uri.getHost());
    } catch (MalformedURLException e) {
      log.warn("The string '{}' is not a valid URL", url);
      return null;
    }
  }
}
Code Example #17
/** Unit test for the archive upload tool. */
public class UploadTester {
  private UseTestRemoteFile ulrf = new UseTestRemoteFile();
  private PreventSystemExit pse = new PreventSystemExit();
  private PreserveStdStreams pss = new PreserveStdStreams(true);
  private MoveTestFiles mtf = new MoveTestFiles(TestInfo.DATA_DIR, TestInfo.WORKING_DIR);
  private MockupJMS mjms = new MockupJMS();
  private MockupArcRepositoryClient marc = new MockupArcRepositoryClient();
  ReloadSettings rs = new ReloadSettings();

  /** Max number of store retries. */
  private final int storeRetries =
      Settings.getInt(JMSArcRepositoryClient.ARCREPOSITORY_STORE_RETRIES);

  @Before
  public void setUp() {
    rs.setUp();
    ulrf.setUp();
    mjms.setUp();
    mtf.setUp();
    pss.setUp();
    pse.setUp();
    marc.setUp();
    Settings.set(CommonSettings.NOTIFICATIONS_CLASS, RememberNotifications.class.getName());
  }

  @After
  public void tearDown() {
    marc.tearDown();
    pse.tearDown();
    pss.tearDown();
    mtf.tearDown();
    mjms.tearDown();
    ulrf.tearDown();
    RememberNotifications.resetSingleton();
    rs.tearDown();
  }

  @Test
  public void testConstructor() {
    ReflectUtils.testUtilityConstructor(Upload.class);
  }

  /** Verify that uploading a single ARC file works as expected and deletes the file locally. */
  @Test
  public void testMainOneFile() {
    Upload.main(new String[] {TestInfo.ARC1.getAbsolutePath()});
    assertMsgCount(1, 0);
    assertStoreStatus(0, TestInfo.ARC1, true);
  }

  /**
   * Verify that uploading more than one ARC file works as expected and deletes the files locally.
   */
  @Test
  public void testMainSeveralFiles() {
    Upload.main(new String[] {TestInfo.ARC1.getAbsolutePath(), TestInfo.ARC2.getAbsolutePath()});
    assertMsgCount(2, 0);
    assertStoreStatus(0, TestInfo.ARC1, true);
    assertStoreStatus(1, TestInfo.ARC2, true);
  }

  /**
   * Verify that non-ARC files are rejected and execution fails. Also verifies that nothing is
   * stored in that case.
   */
  @Test
  public void testMainNonArc() {
    try {
      Upload.main(
          new String[] {TestInfo.ARC1.getAbsolutePath(), TestInfo.INDEX_DIR.getAbsolutePath()});
      fail("Calling Upload with non-arc file should System.exit");
    } catch (SecurityException e) {
      // Expected
      assertMsgCount(0, 0);
    }
  }

  /** Verify that uploading a single WARC file works as expected and deletes the file locally. */
  @Test
  public void testMainOneWarcFile() {
    Upload.main(new String[] {TestInfo.WARC1.getAbsolutePath()});
    assertMsgCount(1, 0);
    assertStoreStatus(0, TestInfo.WARC1, true);
  }

  /**
   * Verify that the system fails as expected when the store operation fails on the server side.
   * (Local files must NOT be deleted).
   */
  @Test
  public void testMainStoreFails1() {
    marc.failOnFile(TestInfo.ARC1.getName());

    Upload.main(
        new String[] {
          TestInfo.ARC1.getAbsolutePath(),
          TestInfo.ARC2.getAbsolutePath(),
          TestInfo.ARC3.getAbsolutePath()
        });
    assertMsgCount(2, 1);
    int index = 0;
    for (int i = 0; i < storeRetries; i++) {
      assertStoreStatus(index, TestInfo.ARC1, false);
      index++;
    }
    assertStoreStatus(index, TestInfo.ARC2, true);
    index++;
    assertStoreStatus(index, TestInfo.ARC3, true);
  }

  /**
   * Verify that the system fails as expected when the store operation fails on the server side.
   * (Local files must NOT be deleted).
   */
  @Test
  public void testMainStoreFails2() {
    marc.failOnFile(TestInfo.ARC2.getName());

    Upload.main(
        new String[] {
          TestInfo.ARC1.getAbsolutePath(),
          TestInfo.ARC2.getAbsolutePath(),
          TestInfo.ARC3.getAbsolutePath()
        });
    assertMsgCount(2, 1);
    int index = 0;
    assertStoreStatus(index, TestInfo.ARC1, true);
    index++;
    for (int i = 0; i < storeRetries; i++) {
      assertStoreStatus(index, TestInfo.ARC2, false);
      index++;
    }
    assertStoreStatus(index, TestInfo.ARC3, true);
  }

  /**
   * Verify that the system fails as expected when the store operation fails on the server side.
   * (Local files must NOT be deleted).
   */
  @Test
  public void testMainStoreFails3() {
    marc.failOnFile(TestInfo.ARC3.getName());

    Upload.main(
        new String[] {
          TestInfo.ARC1.getAbsolutePath(),
          TestInfo.ARC2.getAbsolutePath(),
          TestInfo.ARC3.getAbsolutePath()
        });
    assertMsgCount(2, 1);
    int index = 0;
    assertStoreStatus(index, TestInfo.ARC1, true);
    index++;
    assertStoreStatus(index, TestInfo.ARC2, true);
    index++;
    for (int i = 0; i < storeRetries; i++) {
      assertStoreStatus(index, TestInfo.ARC3, false);
      index++;
    }
  }

  /**
   * Verifies that calling Upload without arguments fails. Also verifies that nothing is stored in
   * that case.
   */
  @Test
  public void testNoArguments() {
    try {
      Upload.main(new String[] {});
      fail("Calling Upload without arguments should System.exit");
    } catch (SecurityException e) {
      // Expected
      assertMsgCount(0, 0);
    }
  }

  /**
   * Asserts that we got the expected number of StoreMessages.
   *
   * @param succeeded Number of files successfully stored
   * @param failed Number of files that never got stored
   */
  private void assertMsgCount(int succeeded, int failed) {
    int expected = succeeded + failed * storeRetries;
    assertEquals(
        "Upload should generate exactly 1 StoreMessage "
            + "per succeeded arc file and "
            + storeRetries
            + " per failed store",
        expected,
        marc.getMsgCount());
  }
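
  // Worked example (hypothetical): with storeRetries = 3, assertMsgCount(2, 1)
  // expects 2 + 1 * 3 = 5 StoreMessages recorded by the mock repository client.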

  /**
   * Asserts that the nth StoreMessage is regarding the given arc file and that the arc file is
   * deleted if and only if the store succeeded.
   *
   * @param n The relevant index to marc.getStoreMsgs()
   * @param arcFile The arc file that was stored
   * @param shouldSucceed Whether store was supposed to succeed
   */
  private void assertStoreStatus(int n, File arcFile, boolean shouldSucceed) {
    StoreMessage sm = marc.getStoreMsgs().get(n);
    assertEquals(
        "Upload should attempt to upload the specified files",
        arcFile.getName(),
        sm.getArcfileName());
    if (shouldSucceed) {
      assertFalse("Upload should delete a properly uploaded file", arcFile.exists());
    } else {
      assertTrue(
          "Upload should not delete a file that wasn't properly uploaded", arcFile.exists());
    }
  }
}