/** * Create a bitarchive with access denied to the location of admin data, and verify that the * expected exception is thrown. */ @Test public void testAccessDenied() { // Make sure archive exists assertTrue("Inaccessible archive dir must exist", NOACCESS_ARCHIVE_DIR.exists()); if (NOACCESS_ARCHIVE_DIR.canWrite()) { NOACCESS_ARCHIVE_DIR.setReadOnly(); } // and that admin file is inaccessible assertFalse( "Must not be able to write to inaccessible admin file", NOACCESS_ARCHIVE_DIR.canWrite()); try { Settings.set( ArchiveSettings.BITARCHIVE_SERVER_FILEDIR, NOACCESS_ARCHIVE_DIR.getAbsolutePath()); Bitarchive ba = Bitarchive.getInstance(); ba.close(); fail("Accessing read-only archive should throw exception"); // unreachable } catch (PermissionDenied e) { // Expected case StringAsserts.assertStringContains( "Should mention noaccess dir", "noaccess/filedir", e.getMessage()); } }
/** FIXME Fails in Hudson */ public void failingTestExecute() { File arcFile = new File(TestInfo.BATCH_DIR, "MimeUrlSearch.jar"); assertTrue(arcFile.isFile()); Settings.set( "settings.common.batch.batchjobs.batchjob.class", "dk.netarkivet.common.utils.batch.UrlSearch"); Settings.set("settings.common.batch.batchjobs.batchjob.jarfile", arcFile.getAbsolutePath()); MockHttpServletRequest request = new MockHttpServletRequest() { @Override public Locale getLocale() { return new Locale("en"); } @Override public int getRemotePort() { return 0; } @Override public String getLocalName() { return null; } @Override public String getLocalAddr() { return null; } @Override public int getLocalPort() { return 0; } }; request.setupAddParameter(Constants.FILETYPE_PARAMETER, BatchFileType.Metadata.toString()); request.setupAddParameter(Constants.JOB_ID_PARAMETER, "1234567890"); request.setupAddParameter( Constants.BATCHJOB_PARAMETER, "dk.netarkivet.common.utils.batch.UrlSearch"); request.setupAddParameter(Constants.REPLICA_PARAMETER, "BarOne"); request.setupAddParameter("arg1", "DUMMY-ARG"); request.setupAddParameter("arg2", "url"); request.setupAddParameter("arg3", "mimetype"); Locale l = new Locale("en"); JspWriterMockup out = new JspWriterMockup(); PageContext context = new WebinterfaceTestCase.TestPageContext(request, out, l); BatchGUI.execute(context); }
@Before public void setUp() { rs.setUp(); mjms.setUp(); listener = new CreateIndexListener(); JMSConnectionFactory.getInstance().setListener(Channels.getTheIndexServer(), listener); Settings.set( CommonSettings.REMOTE_FILE_CLASS, "dk.netarkivet.common.distribute.NullRemoteFile"); Settings.set(CommonSettings.CACHE_DIR, TestInfo.CACHE_DIR.getPath()); mtf.setUp(); pss.setUp(); pse.setUp(); }
public void setUp() throws Exception { ChannelsTester.resetChannels(); rs.setUp(); mtf.setUp(); utrf.setUp(); JMSConnectionMockupMQ.useJMSConnectionMockupMQ(); DatabaseTestUtils.takeDatabase(TestInfo.DATABASE_FILE, TestInfo.DATABASE_DIR); // define the settings for accessing the database Settings.set(ArchiveSettings.BASEURL_ARCREPOSITORY_ADMIN_DATABASE, TestInfo.DATABASE_URL); Settings.set(ArchiveSettings.MACHINE_ARCREPOSITORY_ADMIN_DATABASE, ""); Settings.set(ArchiveSettings.PORT_ARCREPOSITORY_ADMIN_DATABASE, ""); Settings.set(ArchiveSettings.DIR_ARCREPOSITORY_ADMIN_DATABASE, ""); Settings.set(CommonSettings.NOTIFICATIONS_CLASS, PrintNotifications.class.getName()); }
/** * Calls the Unix sort command with the options <code>$fileNames -o * $outputfile -T WaybackSettings#WAYBACK_AGGREGATOR_TEMP_DIR</code>. * * Sets the LC_ALL environment variable before making the call. * * @param files The files to merge and sort * @param outputFile The resulting sorted file * @param additionalArgs A list of extra arguments which, if not null, are * added to the sort call.<p> Note: If any * of the arguments contains whitespace the call will * fail. */ private void processFiles(File[] files, File outputFile, List<String> additionalArgs) { if (files.length == 0) { // An empty file list would cause sort to wait for input on stdin, // and the call would therefore never return return; } Process p = null; try { List<String> inputFileList = new LinkedList<String>(); for (int i = 0; i < files.length; i++) { if (files[i].exists() && files[i].isFile()) { inputFileList.add(files[i].getCanonicalPath()); } else { log.warn( "File " + files[i] + " doesn't exist or isn't a regular file, " + "dropping from list of files to " + "sort and merge"); } } List<String> cmd = new LinkedList<String>(); // Prepare to run the unix sort command, see sort manual page for // details cmd.add("sort"); cmd.addAll(inputFileList); cmd.add("-o"); cmd.add(outputFile.getCanonicalPath()); cmd.add("-T"); cmd.add(Settings.get(WaybackSettings.WAYBACK_AGGREGATOR_TEMP_DIR)); if (additionalArgs != null && !additionalArgs.isEmpty()) { for (String argument : additionalArgs) { ArgumentNotValid.checkTrue( argument.indexOf(' ') == -1, "The argument '" + argument + "' contains spaces; this isn't allowed"); } cmd.addAll(additionalArgs); } ProcessBuilder pb = new ProcessBuilder(cmd); // Reset all locale definitions pb.environment().put("LC_ALL", "C"); // Run the command in the user.dir directory pb.directory(new File(System.getProperty("user.dir"))); p = pb.start(); p.waitFor(); if (p.exitValue() != 0) { log.error("Failed to sort index files, sort exited with " + "return code " + p.exitValue()); } } catch (Exception e) { log.error("Failed to aggregate indexes ", e); } }
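// A minimal, self-contained sketch of the external-sort pattern used by
// processFiles() above: build the argument list, force LC_ALL=C for stable
// byte-order collation, and wait for the process to finish. The input and
// output file names here are hypothetical, and a POSIX `sort` binary is
// assumed to be on the PATH.
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

public class ExternalSortSketch {
    public static void main(String[] args) throws IOException, InterruptedException {
        List<String> cmd = new ArrayList<>();
        cmd.add("sort");
        cmd.add("input-a.txt"); // hypothetical input files
        cmd.add("input-b.txt");
        cmd.add("-o");
        cmd.add("merged-sorted.txt"); // hypothetical output file
        cmd.add("-T");
        cmd.add(System.getProperty("java.io.tmpdir")); // scratch dir for sort

        ProcessBuilder pb = new ProcessBuilder(cmd);
        pb.environment().put("LC_ALL", "C"); // locale-independent collation
        pb.directory(new File(System.getProperty("user.dir")));
        Process p = pb.start();
        if (p.waitFor() != 0) {
            System.err.println("sort exited with return code " + p.exitValue());
        }
    }
}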
@Before public void setUp() { rs.setUp(); ulrf.setUp(); mjms.setUp(); mtf.setUp(); pss.setUp(); pse.setUp(); marc.setUp(); Settings.set(CommonSettings.NOTIFICATIONS_CLASS, RememberNotifications.class.getName()); }
/** Create a bitarchive from scratch, where no admin data or log files exist yet. */ @Test public void testFromScratch() { LogbackRecorder lr = LogbackRecorder.startRecorder(); assertFalse("No bitarchive should exist before creating it", NEW_ARCHIVE_DIR.exists()); // Create new test archive and close it Settings.set(ArchiveSettings.BITARCHIVE_SERVER_FILEDIR, NEW_ARCHIVE_DIR.getAbsolutePath()); Bitarchive ba = Bitarchive.getInstance(); ba.close(); // verify that the directory, admin and log files are created assertTrue("The archive dir should exist after creation", NEW_ARCHIVE_DIR.exists()); assertFalse("Log file should exist after creation", lr.isEmpty()); lr.stopRecorder(); }
@Before public void setUp() throws Exception { rs.setUp(); mtf.setUp(); JMSConnectionMockupMQ.useJMSConnectionMockupMQ(); Settings.set(ArchiveSettings.DIRS_ARCREPOSITORY_ADMIN, TestInfo.WORKING_DIR.getAbsolutePath()); if (!Replica.isKnownReplicaId("TWO") || !Replica.isKnownReplicaId("ONE")) { fail( "These tests assume that ONE and TWO are known replica IDs. The only known replicas are: " + StringUtils.conjoin(", ", Replica.getKnownIds())); } // super.setUp(); }
public class IndexerTestCase { private String oldClient = System.getProperty(CommonSettings.ARC_REPOSITORY_CLIENT); private String oldFileDir = System.getProperty("settings.common.arcrepositoryClient.fileDir"); protected static File tempdir = new File(Settings.get(WaybackSettings.WAYBACK_INDEX_TEMPDIR)); ReloadSettings rs = new ReloadSettings(); @Before public void setUp() { rs.setUp(); System.setProperty(WaybackSettings.HIBERNATE_HBM2DDL_AUTO, "create-drop"); HibernateUtil.getSession().getSessionFactory().close(); FileUtils.removeRecursively(TestInfo.WORKING_DIR); TestFileUtils.copyDirectoryNonCVS(TestInfo.ORIGINALS_DIR, TestInfo.WORKING_DIR); System.setProperty( CommonSettings.ARC_REPOSITORY_CLIENT, "dk.netarkivet.common.distribute.arcrepository.LocalArcRepositoryClient"); System.setProperty( "settings.common.arcrepositoryClient.fileDir", TestInfo.FILE_DIR.getAbsolutePath()); System.setProperty( CommonSettings.REMOTE_FILE_CLASS, "dk.netarkivet.common.distribute.TestRemoteFile"); assertTrue( ArcRepositoryClientFactory.getPreservationInstance() instanceof LocalArcRepositoryClient); } @After public void tearDown() { HibernateUtil.getSession().getSessionFactory().close(); FileUtils.removeRecursively(TestInfo.WORKING_DIR); if (oldClient != null) { System.setProperty(CommonSettings.ARC_REPOSITORY_CLIENT, oldClient); } else { System.setProperty(CommonSettings.ARC_REPOSITORY_CLIENT, ""); } if (oldFileDir != null) { System.setProperty("settings.common.arcrepositoryClient.fileDir", oldFileDir); } else { System.setProperty("settings.common.arcrepositoryClient.fileDir", ""); } rs.tearDown(); } }
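// A hedged usage sketch: IndexerTestCase declares its fixture logic in @Before
// and @After methods, so JUnit runs them automatically for any subclass and a
// concrete test only adds its own logic. This subclass and its assertion are
// illustrative, not part of the actual suite.
public class ExampleIndexerTest extends IndexerTestCase {
    @Test
    public void testTempDirIsConfigured() {
        // tempdir is initialised by the base class from WaybackSettings.WAYBACK_INDEX_TEMPDIR.
        org.junit.Assert.assertNotNull(tempdir);
    }
}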
public void setUp() { mtf.setUp(); rs.setUp(); Settings.set(CommonSettings.BATCHJOBS_BASEDIR, TestInfo.BATCH_DIR.getAbsolutePath()); }
/** * Batchjob that extracts lines matching a regular expression from a crawl log. The batch job should * be restricted to run on metadata files for a specific job only, using the {@link * #processOnlyFilesMatching(String)} construct. */ @SuppressWarnings({"serial"}) public class CrawlLogLinesMatchingRegexp extends ArchiveBatchJob { /** The logger. */ private static final Logger log = LoggerFactory.getLogger(CrawlLogLinesMatchingRegexp.class); /** Metadata URL for crawl logs. */ private static final String SETUP_URL_FORMAT = String.format( "metadata://%s/crawl/logs/crawl.log", Settings.get(CommonSettings.ORGANIZATION)); /** The regular expression to match in the crawl.log line. */ private final String regexp; /** * Initialise the batch job. * * @param regexp The regexp to match in the crawl.log lines. */ public CrawlLogLinesMatchingRegexp(String regexp) { ArgumentNotValid.checkNotNullOrEmpty(regexp, "regexp"); this.regexp = regexp; // One week in milliseconds. batchJobTimeout = 7 * Constants.ONE_DAY_IN_MILLIES; } /** * Does nothing, no initialisation is needed. * * @param os Not used. */ @Override public void initialize(OutputStream os) {} @Override public ArchiveBatchFilter getFilter() { return new ArchiveBatchFilter("OnlyCrawlLog") { public boolean accept(ArchiveRecordBase record) { String url = record.getHeader().getUrl(); if (url == null) { return false; } return url.startsWith(SETUP_URL_FORMAT); } }; } /** * Process a crawl log record, writing the lines matching the regular expression to the result. * * @param record The record to process. * @param os The output stream for the result. * @throws ArgumentNotValid on null parameters * @throws IOFailure on trouble processing the record. */ @Override public void processRecord(ArchiveRecordBase record, OutputStream os) { ArgumentNotValid.checkNotNull(record, "ArchiveRecordBase record"); ArgumentNotValid.checkNotNull(os, "OutputStream os"); BufferedReader arcreader = new BufferedReader(new InputStreamReader(record.getInputStream())); try { for (String line = arcreader.readLine(); line != null; line = arcreader.readLine()) { if (line.matches(regexp)) { os.write(line.getBytes("UTF-8")); os.write('\n'); } } } catch (IOException e) { throw new IOFailure("Unable to process (w)arc record", e); } finally { try { arcreader.close(); } catch (IOException e) { log.warn("Unable to close arcreader properly", e); } } } /** * Does nothing, no finishing is needed. * * @param os Not used. */ @Override public void finish(OutputStream os) {} @Override public String toString() { return getClass().getName() + ", with arguments: Regexp = " + regexp + ", Filter = " + getFilter(); } }
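// A hedged usage sketch, following the class javadoc's advice to restrict the
// batch job to the metadata files of a single harvest job via
// processOnlyFilesMatching(String), which the javadoc references on the batch
// job hierarchy. The regular expressions below, including the metadata
// filename pattern, are illustrative assumptions.
long jobId = 1234; // hypothetical harvest job id
CrawlLogLinesMatchingRegexp batchJob = new CrawlLogLinesMatchingRegexp(".*robots\\.txt.*");
batchJob.processOnlyFilesMatching(jobId + "-metadata-.*"); // assumed filename pattern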
@Test public void testProcessMissingRequest() throws Exception { Settings.set( ArchiveSettings.DIR_ARCREPOSITORY_BITPRESERVATION, TestInfo.WORKING_DIR.getAbsolutePath()); Settings.set(ArchiveSettings.DIRS_ARCREPOSITORY_ADMIN, TestInfo.WORKING_DIR.getAbsolutePath()); // Ensure that admin data exists before we start. AdminData.getUpdateableInstance(); MockFileBasedActiveBitPreservation mockabp = new MockFileBasedActiveBitPreservation(); MockHttpServletRequest request = new MockHttpServletRequest(); String replicaID1 = "ONE"; String replicaID2 = "TWO"; String filename1 = "foo"; String filename2 = "bar"; Locale defaultLocale = new Locale("da"); // First test a working set of params Map<String, String[]> args = new HashMap<String, String[]>(); args.put( ADD_COMMAND, new String[] { Replica.getReplicaFromId(replicaID1).getName() + STRING_FILENAME_SEPARATOR + filename1 }); request.setupAddParameter( ADD_COMMAND, new String[] { Replica.getReplicaFromId(replicaID1).getName() + STRING_FILENAME_SEPARATOR + filename1 }); args.put(GET_INFO_COMMAND, new String[] {filename1}); request.setupAddParameter(GET_INFO_COMMAND, new String[] {filename1}); args.put(BITARCHIVE_NAME_PARAM, new String[] {Replica.getReplicaFromId(replicaID1).getName()}); request.setupAddParameter( BITARCHIVE_NAME_PARAM, new String[] {Replica.getReplicaFromId(replicaID1).getName()}); request.setupGetParameterMap(args); request.setupGetParameterNames(new Vector<String>(args.keySet()).elements()); Map<String, PreservationState> status = BitpreserveFileState.processMissingRequest( WebinterfaceTestCase.getDummyPageContext(defaultLocale, request), new StringBuilder()); assertEquals("Should have one call to reestablish", 1, mockabp.getCallCount(ADD_METHOD)); assertEquals( "Should have one call to getFilePreservationStatus", 1, mockabp.getCallCount(GET_INFO_METHOD)); assertEquals("Should have one info element (with mock results)", null, status.get(filename1)); // Check that we can call without any params mockabp.calls.clear(); request = new MockHttpServletRequest(); args.clear(); args.put(BITARCHIVE_NAME_PARAM, new String[] {Replica.getReplicaFromId(replicaID1).getName()}); request.setupAddParameter( BITARCHIVE_NAME_PARAM, new String[] {Replica.getReplicaFromId(replicaID1).getName()}); request.setupGetParameterMap(args); status = BitpreserveFileState.processMissingRequest( WebinterfaceTestCase.getDummyPageContext(defaultLocale, request), new StringBuilder()); assertEquals("Should have no call to reestablish", 0, mockabp.getCallCount(ADD_METHOD)); assertEquals( "Should have no call to getFilePreservationStatus", 0, mockabp.getCallCount(GET_INFO_METHOD)); assertEquals("Should have no status", 0, status.size()); // Check that we can handle more than one call to each and that the // args are correct.
mockabp.calls.clear(); request = new MockHttpServletRequest(); args.clear(); args.put(BITARCHIVE_NAME_PARAM, new String[] {Replica.getReplicaFromId(replicaID2).getName()}); request.setupAddParameter( BITARCHIVE_NAME_PARAM, new String[] {Replica.getReplicaFromId(replicaID2).getName()}); request.setupAddParameter( ADD_COMMAND, new String[] { Replica.getReplicaFromId(replicaID2).getName() + STRING_FILENAME_SEPARATOR + filename1, Replica.getReplicaFromId(replicaID2).getName() + STRING_FILENAME_SEPARATOR + filename1 }); args.put( ADD_COMMAND, new String[] { Replica.getReplicaFromId(replicaID2).getName() + STRING_FILENAME_SEPARATOR + filename1, Replica.getReplicaFromId(replicaID2).getName() + STRING_FILENAME_SEPARATOR + filename1 }); request.setupAddParameter(GET_INFO_COMMAND, new String[] {filename1, filename2, filename1}); args.put(GET_INFO_COMMAND, new String[] {filename1, filename2, filename1}); request.setupGetParameterMap(args); status = BitpreserveFileState.processMissingRequest( WebinterfaceTestCase.getDummyPageContext(defaultLocale, request), new StringBuilder()); assertEquals("Should have two calls to reestablish", 2, mockabp.getCallCount(ADD_METHOD)); assertEquals( "Should have three calls to getFilePreservationStatus", 3, mockabp.getCallCount(GET_INFO_METHOD)); assertEquals("Should have two info elements", 2, status.size()); assertEquals("Should have info for filename1", null, status.get(filename1)); assertEquals("Should have info for filename2", null, status.get(filename2)); CollectionAsserts.assertIteratorEquals( "Should have the args given to add", Arrays.asList(new String[] {filename1 + "," + replicaID2, filename1 + "," + replicaID2}) .iterator(), mockabp.calls.get(ADD_METHOD).iterator()); CollectionAsserts.assertIteratorEquals( "Should have the args given to info", Arrays.asList(new String[] {filename1, filename2, filename1}).iterator(), mockabp.calls.get(GET_INFO_METHOD).iterator()); }
/** * A base class for {@link JobGenerator} implementations. It is recommended to extend this class to * implement a new job generator. * * <p>The base algorithm iterates over the domain configurations within the harvest definition and, * according to the setting {@link HarvesterSettings#JOBGEN_DOMAIN_CONFIG_SUBSET_SIZE}, builds * subsets of domain configurations from which one or more jobs will be generated. */ abstract class AbstractJobGenerator implements JobGenerator { /** Logger for this class. */ private static Log log = LogFactory.getLog(AbstractJobGenerator.class); /** How many domain configurations to process in one go. */ private final long DOMAIN_CONFIG_SUBSET_SIZE = Settings.getLong(HarvesterSettings.JOBGEN_DOMAIN_CONFIG_SUBSET_SIZE); /** Is deduplication enabled or disabled. */ private final boolean DEDUPLICATION_ENABLED = Settings.getBoolean(HarvesterSettings.DEDUPLICATION_ENABLED); @Override public int generateJobs(HarvestDefinition harvest) { log.info("Generating jobs for harvestdefinition # " + harvest.getOid()); int jobsMade = 0; final Iterator<DomainConfiguration> domainConfigurations = harvest.getDomainConfigurations(); while (domainConfigurations.hasNext()) { List<DomainConfiguration> subset = new ArrayList<DomainConfiguration>(); while (domainConfigurations.hasNext() && subset.size() < DOMAIN_CONFIG_SUBSET_SIZE) { subset.add(domainConfigurations.next()); } Collections.sort(subset, getDomainConfigurationSubsetComparator(harvest)); if (log.isTraceEnabled()) { log.trace( subset.size() + " domainconfigs now sorted and ready for processing " + "for harvest #" + harvest.getOid()); } jobsMade += processDomainConfigurationSubset(harvest, subset.iterator()); } harvest.setNumEvents(harvest.getNumEvents() + 1); if (!harvest.isSnapShot()) { PartialHarvest focused = (PartialHarvest) harvest; Schedule schedule = focused.getSchedule(); int numEvents = harvest.getNumEvents(); // Calculate next event Date now = new Date(); Date nextEvent = schedule.getNextEvent(focused.getNextDate(), numEvents); // Refuse to schedule event in the past if (nextEvent != null && nextEvent.before(now)) { int eventsSkipped = 0; while (nextEvent != null && nextEvent.before(now)) { nextEvent = schedule.getNextEvent(nextEvent, numEvents); eventsSkipped++; } if (log.isWarnEnabled()) { log.warn( "Refusing to schedule harvest definition '" + harvest.getName() + "' in the past. Skipped " + eventsSkipped + " events. Old nextDate was " + focused.getNextDate() + " new nextDate is " + nextEvent); } } // Set next event focused.setNextDate(nextEvent); if (log.isTraceEnabled()) { log.trace( "Next event for harvest definition " + harvest.getName() + " happens: " + (nextEvent == null ? "Never" : nextEvent.toString())); } } log.info( "Finished generating " + jobsMade + " jobs for harvestdefinition # " + harvest.getOid()); return jobsMade; } /** * Instantiates a new job.
* * @param cfg the {@link DomainConfiguration} being processed * @param harvest the {@link HarvestDefinition} being processed * @return an instance of {@link Job} */ public static Job getNewJob(HarvestDefinition harvest, DomainConfiguration cfg) { HarvestChannelDAO harvestChannelDao = HarvestChannelDAO.getInstance(); HarvestChannel channel = harvestChannelDao.getChannelForHarvestDefinition(harvest.getOid()); if (channel == null) { log.info( "No channel mapping registered for harvest id " + harvest.getOid() + ", will use default."); channel = harvestChannelDao.getDefaultChannel(harvest.isSnapShot()); } if (harvest.isSnapShot()) { return Job.createSnapShotJob( harvest.getOid(), channel, cfg, harvest.getMaxCountObjects(), harvest.getMaxBytes(), ((FullHarvest) harvest).getMaxJobRunningTime(), harvest.getNumEvents()); } return Job.createJob(harvest.getOid(), channel, cfg, harvest.getNumEvents()); } /** * Returns a comparator used to sort the subset of {@link #DOMAIN_CONFIG_SUBSET_SIZE} * configurations that are scanned at each iteration. * * @param harvest the {@link HarvestDefinition} being processed. * @return a comparator */ protected abstract Comparator<DomainConfiguration> getDomainConfigurationSubsetComparator( HarvestDefinition harvest); /** * Create new jobs from a collection of configurations. All configurations must use the same * order.xml file. * * @param harvest the {@link HarvestDefinition} being processed. * @param domainConfSubset the configurations to use to create the jobs * @return The number of jobs created * @throws ArgumentNotValid if any of the parameters is null or if the cfglist does not contain * any configurations */ protected abstract int processDomainConfigurationSubset( HarvestDefinition harvest, Iterator<DomainConfiguration> domainConfSubset); @Override public boolean canAccept(Job job, DomainConfiguration cfg) { if (!checkAddDomainConfInvariant(job, cfg)) { return false; } return checkSpecificAcceptConditions(job, cfg); } /** * Called by {@link #canAccept(Job, DomainConfiguration)}. Tests the implementation-specific * conditions to accept the given {@link DomainConfiguration} in the given {@link Job}. It is * assumed that {@link #checkAddDomainConfInvariant(Job, DomainConfiguration)} has already passed. * * @param job the {@link Job} being built * @param cfg the {@link DomainConfiguration} to test * @return true if the configuration passes the conditions. */ protected abstract boolean checkSpecificAcceptConditions(Job job, DomainConfiguration cfg); /** * Once the job has been filled with {@link DomainConfiguration}s, performs the following * operations: * * <ol> * <li>Edit the harvest template to add/remove deduplicator configuration. * </ol> * * @param job the job */ protected void editJobOrderXml(Job job) { Document doc = job.getOrderXMLdoc(); if (DEDUPLICATION_ENABLED) { // Check that the Deduplicator element is present in the order XML and enabled.
// If missing or disabled, log a warning. if (!HeritrixTemplate.isDeduplicationEnabledInTemplate(doc)) { if (log.isWarnEnabled()) { log.warn( "Unable to perform deduplication for this job" + " as the required DeDuplicator element is " + "disabled or missing from template"); } } } else { // Remove deduplicator Element from OrderXML if present Node xpathNode = doc.selectSingleNode(HeritrixTemplate.DEDUPLICATOR_XPATH); if (xpathNode != null) { xpathNode.detach(); job.setOrderXMLDoc(doc); if (log.isInfoEnabled()) { log.info("Removed DeDuplicator element because " + "Deduplication is disabled"); } } } } /** * Tests that: * * <ol> * <li>The given domain configuration and job are not null. * <li>The job does not already contain the given domain configuration. * <li>The domain configuration has the same order xml name as the first inserted domain config. * </ol> * * @param job a given Job * @param cfg a given DomainConfiguration * @return true, if the given DomainConfiguration can be inserted into the given job */ private boolean checkAddDomainConfInvariant(Job job, DomainConfiguration cfg) { ArgumentNotValid.checkNotNull(job, "job"); ArgumentNotValid.checkNotNull(cfg, "cfg"); // check if domain in DomainConfiguration cfg is not already in this job // domainName is used as key in domainConfigurationMap if (job.getDomainConfigurationMap().containsKey(cfg.getDomainName())) { if (log.isDebugEnabled()) { log.debug("Job already has a configuration for Domain '" + cfg.getDomainName() + "'."); } return false; } // check if template is same as this job. String orderXMLname = job.getOrderXMLName(); if (!orderXMLname.equals(cfg.getOrderXmlName())) { if (log.isDebugEnabled()) { log.debug( "This Job only accepts configurations " + "using the harvest template '" + orderXMLname + "'. This configuration uses the harvest template '" + cfg.getOrderXmlName() + "'."); } return false; } return true; } }
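// A minimal, hypothetical subclass illustrating the hooks an
// AbstractJobGenerator implementation must provide. The ordering and the
// one-job-per-subset policy below are illustrative assumptions, not the
// behaviour of any generator shipped with NetarchiveSuite.
import java.util.Comparator;
import java.util.Iterator;

class SingleJobPerSubsetGenerator extends AbstractJobGenerator {

    @Override
    protected Comparator<DomainConfiguration> getDomainConfigurationSubsetComparator(
            HarvestDefinition harvest) {
        // Illustrative ordering: group configurations by harvest template name,
        // since a job only accepts configurations sharing one order.xml.
        return Comparator.comparing(DomainConfiguration::getOrderXmlName);
    }

    @Override
    protected int processDomainConfigurationSubset(
            HarvestDefinition harvest, Iterator<DomainConfiguration> domainConfSubset) {
        // Illustrative policy: put every acceptable configuration into one job.
        Job job = getNewJob(harvest, domainConfSubset.next());
        while (domainConfSubset.hasNext()) {
            DomainConfiguration cfg = domainConfSubset.next();
            if (canAccept(job, cfg)) {
                job.addConfiguration(cfg);
            }
        }
        return 1; // one job generated for this subset
    }

    @Override
    protected boolean checkSpecificAcceptConditions(Job job, DomainConfiguration cfg) {
        return true; // no extra conditions in this sketch
    }
}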
/** * This class implements a generator for a history chart of a running job. The chart traces the * progress percentage and the queued URI count over the crawl time. Charts are rendered in a PNG * image file, generated in the webapp directory. */ class StartedJobHistoryChartGen { /** Time units used to scale the crawl time values and generate the chart's time axis ticks. */ protected enum TimeAxisResolution { /** One second. Tick step is 10s. */ second(1, 1, 10), /** One minute. Tick step is 5m. */ minute(60, 60, 5), /** One hour. Tick step is 1h. */ hour(60 * minute.seconds, 60 * minute.seconds, 1), /** Twelve hours. Tick step is 2h. */ half_day(12 * 60 * minute.seconds, 60 * minute.seconds, 2), /** One day. Tick step is 0.5d. */ day(24 * hour.seconds, 24 * hour.seconds, 0.5), /** One week. Tick step is 1w. */ week(7 * day.seconds, 7 * day.seconds, 1); /** The time unit in seconds. */ private final int seconds; /** The scale in seconds. */ private final int scaleSeconds; /** The step between two tick units. */ private final double tickStep; /** * Builds a time axis resolution. * * @param seconds the actual resolution in seconds * @param scaleSeconds the actual "scale" of ticks * @param tickStep the number of ticks in one step. */ TimeAxisResolution(int seconds, int scaleSeconds, double tickStep) { this.seconds = seconds; this.scaleSeconds = scaleSeconds; this.tickStep = tickStep; } /** * Scale down an array of seconds. * * @param timeInSeconds An array of seconds * @return a scaled down version of the given array of seconds */ double[] scale(double[] timeInSeconds) { double[] scaledTime = new double[timeInSeconds.length]; for (int i = 0; i < timeInSeconds.length; i++) { scaledTime[i] = timeInSeconds[i] / this.scaleSeconds; } return scaledTime; } /** * @param seconds the seconds * @return the proper timeUnit for the given argument */ static TimeAxisResolution findTimeUnit(double seconds) { TimeAxisResolution[] allTus = values(); for (int i = 0; i < allTus.length - 1; i++) { TimeAxisResolution nextGreater = allTus[i + 1]; if (seconds < nextGreater.seconds) { return allTus[i]; } } return week; // largest unit } } /** A chart generation task. Generates a PNG image for a job progress history. */ private static class ChartGen implements Runnable { /** The process that generates the charts. */ private final StartedJobHistoryChartGen gen; /** * Constructor of a ChartGen object. * * @param gen the process that generates the charts. */ ChartGen(StartedJobHistoryChartGen gen) { super(); this.gen = gen; } @Override public void run() { synchronized (gen) { gen.chartFile = null; } long jobId = gen.jobId; StartedJobInfo[] fullHistory = RunningJobsInfoDAO.getInstance().getFullJobHistory(jobId); LinkedList<Double> timeValues = new LinkedList<Double>(); LinkedList<Double> progressValues = new LinkedList<Double>(); LinkedList<Double> urlValues = new LinkedList<Double>(); for (StartedJobInfo sji : fullHistory) { timeValues.add((double) sji.getElapsedSeconds()); progressValues.add(sji.getProgress()); urlValues.add((double) sji.getQueuedFilesCount()); } // Refresh the history png image for the job. File pngFile = new File(gen.outputFolder, jobId + "-history.png"); File newPngFile; try { newPngFile = File.createTempFile(jobId + "-history", "."
+ System.currentTimeMillis() + ".png"); } catch (IOException e) { LOG.warn("Failed to create temp PNG file for job " + jobId); return; } long startTime = System.currentTimeMillis(); gen.generatePngChart( newPngFile, CHART_RESOLUTION[0], CHART_RESOLUTION[1], null, // no chart title I18N.getString(gen.locale, "running.job.details.chart.legend.crawlTime"), new String[] { I18N.getString(gen.locale, "running.job.details.chart.legend.progress"), I18N.getString(gen.locale, "running.job.details.chart.legend.queuedUris") }, NumberUtils.toPrimitiveArray(timeValues), new double[][] {new double[] {0, 100}, null}, new double[][] { NumberUtils.toPrimitiveArray(progressValues), NumberUtils.toPrimitiveArray(urlValues) }, new Color[] {Color.blue, Color.green.darker()}, new String[] {"%", ""}, false, Color.lightGray.brighter().brighter()); long genTime = System.currentTimeMillis() - startTime; LOG.info( "Generated history chart for job " + jobId + " in " + (genTime < TimeUtils.SECOND_IN_MILLIS ? genTime + " ms" : StringUtils.formatDuration(genTime / TimeUtils.SECOND_IN_MILLIS)) + "."); synchronized (gen) { // Overwrite old file, then delete temp file try { FileUtils.copyFile(newPngFile, pngFile); FileUtils.remove(newPngFile); } catch (IOFailure iof) { LOG.error("IOFailure while copying PNG file", iof); } gen.chartFile = pngFile; } } } /** The class logger. */ static final Log LOG = LogFactory.getLog(StartedJobHistoryChartGen.class); /** Internationalisation object. */ private static final I18n I18N = new I18n(dk.netarkivet.harvester.Constants.TRANSLATIONS_BUNDLE); /** Rate in seconds at which history charts should be generated. */ private static final long GEN_INTERVAL = Settings.getLong(HarvesterSettings.HARVEST_MONITOR_HISTORY_CHART_GEN_INTERVAL); /** The chart image resolution. */ private static final int[] CHART_RESOLUTION = new int[] {600, 450}; /** The dimension of the chart axis. */ private static final double CHART_AXIS_DIMENSION = 10.0; /** The relative path of the output. */ private static final String OUTPUT_REL_PATH = "History" + File.separator + "webapp"; /** The job id. */ private final long jobId; /** The folder where image files are output. */ private final File outputFolder; /** The chart image file. */ private File chartFile = null; /** The locale for internationalizing the chart. The locale is set to the system default. */ private final Locale locale; /** The process controlling the cyclic regeneration of charts. */ private PeriodicTaskExecutor genExec = null; /** * Constructor. Start generating charts for data belonging to the given job. * * @param jobId a job id. */ StartedJobHistoryChartGen(long jobId) { super(); this.outputFolder = new File(FileUtils.getTempDir() + File.separator + OUTPUT_REL_PATH); this.jobId = jobId; // Set the locale to the system default this.locale = Locale.getDefault(); genExec = new PeriodicTaskExecutor("ChartGen", new ChartGen(this), 0, GEN_INTERVAL); } /** * Returns the image file. * * @return the image file. Might return null if no file is currently available. */ public synchronized File getChartFile() { return chartFile; } /** Deletes the chart image if it exists and stops the generation schedule. */ public void cleanup() { if (chartFile != null && chartFile.exists()) { if (!chartFile.delete()) { chartFile.deleteOnExit(); } } if (genExec != null) { genExec.shutdown(); } } /** * Generates a chart in PNG format. * * @param outputFile the output file, it should exist. * @param pxWidth the image width in pixels. * @param pxHeight the image height in pixels. 
* @param chartTitle the chart title, may be null. * @param xAxisTitle the x axis title * @param yDataSeriesTitles the Y axis titles. * @param timeValuesInSeconds the time values in seconds * @param yDataSeriesRange the Y axis ranges (null for auto) * @param yDataSeries the Y axis value series. * @param yDataSeriesColors the Y axis value series drawing colors. * @param yDataSeriesTickSuffix TODO explain argument yDataSeriesTickSuffix * @param drawBorder whether or not to draw the border. * @param backgroundColor the chart background color. */ final void generatePngChart( File outputFile, int pxWidth, int pxHeight, String chartTitle, String xAxisTitle, String[] yDataSeriesTitles, double[] timeValuesInSeconds, double[][] yDataSeriesRange, double[][] yDataSeries, Color[] yDataSeriesColors, String[] yDataSeriesTickSuffix, boolean drawBorder, Color backgroundColor) { // Domain axis NumberAxis xAxis = new NumberAxis(xAxisTitle); xAxis.setFixedDimension(CHART_AXIS_DIMENSION); xAxis.setLabelPaint(Color.black); xAxis.setTickLabelPaint(Color.black); double maxSeconds = getMaxValue(timeValuesInSeconds); TimeAxisResolution xAxisRes = TimeAxisResolution.findTimeUnit(maxSeconds); xAxis.setTickUnit(new NumberTickUnit(xAxisRes.tickStep)); double[] scaledTimeValues = xAxisRes.scale(timeValuesInSeconds); String tickSymbol = I18N.getString(locale, "running.job.details.chart.timeunit.symbol." + xAxisRes.name()); xAxis.setNumberFormatOverride(new DecimalFormat("###.##'" + tickSymbol + "'")); // First dataset String firstDataSetTitle = yDataSeriesTitles[0]; XYDataset firstDataSet = createXYDataSet(firstDataSetTitle, scaledTimeValues, yDataSeries[0]); Color firstDataSetColor = yDataSeriesColors[0]; // First range axis NumberAxis firstYAxis = new NumberAxis(firstDataSetTitle); firstYAxis.setFixedDimension(CHART_AXIS_DIMENSION); setAxisRange(firstYAxis, yDataSeriesRange[0]); firstYAxis.setLabelPaint(firstDataSetColor); firstYAxis.setTickLabelPaint(firstDataSetColor); String firstAxisTickSuffix = yDataSeriesTickSuffix[0]; if (firstAxisTickSuffix != null && !firstAxisTickSuffix.isEmpty()) { firstYAxis.setNumberFormatOverride(new DecimalFormat("###.##'" + firstAxisTickSuffix + "'")); } // Create the plot with domain axis and first range axis XYPlot plot = new XYPlot(firstDataSet, xAxis, firstYAxis, null); XYLineAndShapeRenderer firstRenderer = new XYLineAndShapeRenderer(true, false); plot.setRenderer(firstRenderer); plot.setOrientation(PlotOrientation.VERTICAL); plot.setBackgroundPaint(Color.lightGray); plot.setDomainGridlinePaint(Color.white); plot.setRangeGridlinePaint(Color.white); plot.setAxisOffset(new RectangleInsets(5.0, 5.0, 5.0, 5.0)); firstRenderer.setSeriesPaint(0, firstDataSetColor); // Now iterate on next axes for (int i = 1; i < yDataSeries.length; i++) { // Create axis String seriesTitle = yDataSeriesTitles[i]; Color seriesColor = yDataSeriesColors[i]; NumberAxis yAxis = new NumberAxis(seriesTitle); yAxis.setFixedDimension(CHART_AXIS_DIMENSION); setAxisRange(yAxis, yDataSeriesRange[i]); yAxis.setLabelPaint(seriesColor); yAxis.setTickLabelPaint(seriesColor); String yAxisTickSuffix = yDataSeriesTickSuffix[i]; if (yAxisTickSuffix != null && !yAxisTickSuffix.isEmpty()) { yAxis.setNumberFormatOverride(new DecimalFormat("###.##'" + yAxisTickSuffix + "'")); } // Create dataset and add axis to plot plot.setRangeAxis(i, yAxis); plot.setRangeAxisLocation(i, AxisLocation.BOTTOM_OR_LEFT); plot.setDataset(i, createXYDataSet(seriesTitle, scaledTimeValues, yDataSeries[i])); plot.mapDatasetToRangeAxis(i, i); XYItemRenderer
renderer = new StandardXYItemRenderer(); renderer.setSeriesPaint(0, seriesColor); plot.setRenderer(i, renderer); } // Create the chart JFreeChart chart = new JFreeChart(chartTitle, JFreeChart.DEFAULT_TITLE_FONT, plot, false); // Customize rendering chart.setBackgroundPaint(Color.white); chart.setBorderVisible(true); chart.setBorderPaint(Color.BLACK); // Render image try { ChartUtilities.saveChartAsPNG(outputFile, chart, pxWidth, pxHeight); } catch (IOException e) { LOG.error("Chart export failed", e); } } /** * Create an XYDataset based on the given arguments. * * @param name The name * @param timeValues The time values * @param values the values * @return a DefaultXYDataset. */ private XYDataset createXYDataSet(String name, double[] timeValues, double[] values) { DefaultXYDataset ds = new DefaultXYDataset(); ds.addSeries(name, new double[][] {timeValues, values}); return ds; } /** * Find the maximum of the given values. If the array is empty, {@link Double#MIN_VALUE} is * returned. * * @param values an array of doubles * @return the maximum of the values given */ private double getMaxValue(double[] values) { double max = Double.MIN_VALUE; for (double v : values) { max = Math.max(v, max); } return max; } /** * Set the axis range. * * @param axis a numberAxis * @param range a range */ private void setAxisRange(NumberAxis axis, double[] range) { if (range == null || range.length != 2) { axis.setAutoRange(true); } else { double lower = range[0]; double upper = range[1]; ArgumentNotValid.checkTrue(lower < upper, "Incorrect range"); axis.setAutoRange(false); axis.setRange(new Range(lower, upper)); } } }
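// A self-contained sketch of the JFreeChart pattern the generator above relies
// on: one dataset, explicit axes, a line renderer, and PNG export. The data
// values and the output file name are hypothetical.
import java.awt.Color;
import java.io.File;
import java.io.IOException;

import org.jfree.chart.ChartUtilities;
import org.jfree.chart.JFreeChart;
import org.jfree.chart.axis.NumberAxis;
import org.jfree.chart.plot.XYPlot;
import org.jfree.chart.renderer.xy.XYLineAndShapeRenderer;
import org.jfree.data.xy.DefaultXYDataset;

public class MiniChartSketch {
    public static void main(String[] args) throws IOException {
        DefaultXYDataset ds = new DefaultXYDataset();
        double[] time = {0, 10, 20, 30};      // hypothetical elapsed seconds
        double[] progress = {0, 25, 60, 100}; // hypothetical progress values
        ds.addSeries("progress", new double[][] {time, progress});

        // Lines without shapes, matching the renderer choice used above.
        XYPlot plot = new XYPlot(ds, new NumberAxis("time (s)"), new NumberAxis("%"),
                new XYLineAndShapeRenderer(true, false));
        plot.setBackgroundPaint(Color.lightGray);

        JFreeChart chart = new JFreeChart(null, JFreeChart.DEFAULT_TITLE_FONT, plot, false);
        chart.setBackgroundPaint(Color.white);
        ChartUtilities.saveChartAsPNG(new File("mini-history.png"), chart, 600, 450);
    }
}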
/** * This class represents one job to run by Heritrix. It is based on a number of configurations, all * using the same order.xml, with at most one configuration per domain. Each job consists of * configurations of approximately the same size; that is, the expected sizes of the smallest and * the largest configuration are within a factor of each other defined as limMaxRelSize (although * differences smaller than limMinAbsSize are ignored). There is a limit, limMaxTotalSize, on the * total size of the job in objects. * * <p>A job may also be limited on bytes or objects, defined either by the configurations in the job * or the harvest definition the job is generated by. * * <p>The job contains the order file, the seedlist and the current status of the job, as well as * the ID of the harvest definition that defined it and names of all the configurations it is based * on. */ @SuppressWarnings({"serial"}) public class Job implements Serializable, JobInfo { private static final transient Logger log = LoggerFactory.getLogger(Job.class); // Persistent fields stored in and read from DAO /** The persistent ID of this job. */ private Long jobID; /** The Id of the harvestdefinition, that generated this job. */ protected Long origHarvestDefinitionID; /** The status of the job. See the JobStatus class for the possible states. */ protected JobStatus status; /** The name of the {@link HarvestChannel} on which this job will be posted. */ private String channel; /** Whether the job belongs to a snapshot or partial harvest. */ private boolean isSnapshot; /** * Overrides the individual configurations maximum setting for objects retrieved from a domain * when set to a positive value. */ private long forceMaxObjectsPerDomain = Constants.HERITRIX_MAXOBJECTS_INFINITY; /** * Overrides the individual configurations maximum setting for bytes retrieved from a domain when * set to other than -1. */ private long forceMaxBytesPerDomain = Constants.HERITRIX_MAXBYTES_INFINITY; /** The name of the harvest template used by the job. */ private String orderXMLname; /** The harvest template used by the job. */ private HeritrixTemplate orderXMLdoc; /** The list of Heritrix settings files. */ private File[] settingsXMLfiles; // The corresponding Dom4j Documents for these files: // private Document[] settingsXMLdocs; /** * A set of seeds involved in this job. Outside the setSeedList() method, the set of seeds is * updated in the addConfiguration() method. */ private Set<String> seedListSet = new HashSet<String>(); /** Which run of the harvest definition this is. */ private int harvestNum; /** Errors during harvesting. */ private String harvestErrors; /** Details about errors during harvesting. */ private String harvestErrorDetails; /** Errors during upload of the harvested data. */ private String uploadErrors; /** Details about errors during upload of the harvested data. */ private String uploadErrorDetails; /** The starting point of the job. */ private Date actualStart; /** The ending point of the job. */ private Date actualStop; /** The time when this job was submitted. */ private Date submittedDate; /** The time when this job was created. */ private Date creationDate; /** Edition is used by the DAO to keep track of changes. */ private long edition = -1; /** Resubmitted as the Job with this ID. If null, this job has not been resubmitted. */ private Long resubmittedAsJobWithID; /** Continuation of this job.
*/ private Long continuationOF; /** * A map (domainName, domainConfigurationName); it must be accessible in order to update job * information (see Ass. 2.4.3). */ private Map<String, String> domainConfigurationMap; /** * A hint to the DAO that configurations have changed. Since configurations are large, the DAO can * use the fact that this is false to avoid updating the config list. The DAO can set it to false * after saving configurations. */ boolean configsChanged = false; // Intermediate fields, non-persistent and only used while building objects /** * Whether the maxObjects field was defined by the harvest definition or the configuration limit. * This is deciding for whether we accept smaller configurations or not when building jobs. True * means the limit is defined by the configuration, false means that it is defined by the harvest * definition. */ private boolean configurationSetsObjectLimit; /** * Whether the maxBytes field was defined by the harvest definition or the configuration limit. * This is deciding for whether we accept smaller configurations or not when building jobs. True * means the limit is defined by the configuration, false means by the harvest definition. */ private boolean configurationSetsByteLimit; /** The lowest number of objects expected by a configuration. */ private long minCountObjects; /** The highest number of objects expected by a configuration. */ private long maxCountObjects; /** The total number of objects expected by all added configurations. */ private long totalCountObjects; /** The max time in seconds given to the harvester for this job. 0 is unlimited. */ private long forceMaxRunningTime; /** * If true, this job object is still undergoing changes due to having more configurations added. * When set to false, the object is considered immutable except for updates of the status. * * <p>Jobs loaded from the DAO are never under construction anymore. */ private boolean underConstruction = true; // Constants // Note: The following constants are intentionally left non-static for easy // unit testing private boolean maxObjectsIsSetByQuotaEnforcer = Settings.getBoolean(HarvesterSettings.OBJECT_LIMIT_SET_BY_QUOTA_ENFORCER); /** * The harvestname prefix used in the files generated by Heritrix. Is set using an * ArchiveFileNaming class when the jobID is available. */ private String harvestnamePrefix; /** This variable is right now the same as harvestdefinitions.audience field. */ private String harvestAudience; protected Job() { this.status = JobStatus.NEW; } /** * Package private constructor for common initialisation. * * @param harvestID the id of the harvestdefinition * @param cfg the configuration to base the Job on * @param orderXMLdoc the harvest template for the job * @param channel the channel on which the job will be submitted. * @param forceMaxObjectsPerDomain the maximum number of objects harvested from a domain, * overrides individual configuration settings. -1 means no limit * @param forceMaxBytesPerDomain The maximum number of bytes harvested from a domain, or -1 for * no limit.
* @param forceMaxJobRunningTime The max time in seconds given to the harvester for this job * @param harvestNum the run number of the harvest definition * @throws ArgumentNotValid if cfg or channel is null, harvestID is invalid, or if any limit < * -1 */ public Job( Long harvestID, DomainConfiguration cfg, HeritrixTemplate orderXMLdoc, HarvestChannel channel, long forceMaxObjectsPerDomain, long forceMaxBytesPerDomain, long forceMaxJobRunningTime, int harvestNum) throws ArgumentNotValid { ArgumentNotValid.checkNotNull(cfg, "cfg"); ArgumentNotValid.checkNotNull(harvestID, "harvestID"); ArgumentNotValid.checkNotNegative(harvestID, "harvestID"); ArgumentNotValid.checkNotNull(channel, "channel"); if (forceMaxObjectsPerDomain < -1) { String msg = "forceMaxObjectsPerDomain must be either -1 or positive"; log.debug(msg); throw new ArgumentNotValid(msg); } if (forceMaxBytesPerDomain < -1) { String msg = "forceMaxBytesPerDomain must be either -1 or positive"; log.debug(msg); throw new ArgumentNotValid(msg); } if (forceMaxBytesPerDomain == 0L) { log.warn( "forceMaxBytesPerDomain should probably not be 0. This means 0 bytes downloaded per domain"); } if (forceMaxObjectsPerDomain == 0L) { log.warn( "forceMaxObjectsPerDomain should probably not be 0. This means 0 objects downloaded per domain"); } // setup initial members domainConfigurationMap = new HashMap<>(); origHarvestDefinitionID = harvestID; orderXMLname = cfg.getOrderXmlName(); this.orderXMLdoc = orderXMLdoc; setHarvestChannel(channel); long maxObjects = NumberUtils.minInf(forceMaxObjectsPerDomain, cfg.getMaxObjects()); setMaxObjectsPerDomain(maxObjects); configurationSetsObjectLimit = (maxObjects != forceMaxObjectsPerDomain); long maxBytes = NumberUtils.minInf(forceMaxBytesPerDomain, cfg.getMaxBytes()); setMaxBytesPerDomain(maxBytes); configurationSetsByteLimit = (maxBytes != forceMaxBytesPerDomain); long expectation = cfg.getExpectedNumberOfObjects(forceMaxObjectsPerDomain, forceMaxBytesPerDomain); maxCountObjects = expectation; minCountObjects = expectation; this.harvestNum = harvestNum; addConfiguration(cfg); setMaxJobRunningTime(forceMaxJobRunningTime); setArchiveFormatInTemplate(Settings.get(HarvesterSettings.HERITRIX_ARCHIVE_FORMAT)); setAttributes(cfg.getAttributesAndTypes()); orderXMLdoc.enableOrDisableDeduplication( Settings.getBoolean(HarvesterSettings.DEDUPLICATION_ENABLED)); status = JobStatus.NEW; } public void setAttributes(List<AttributeAndType> attributesAndTypes) { orderXMLdoc.insertAttributes(attributesAndTypes); } /** Update the order template according to the chosen archive format (arc/warc). */ private void setArchiveFormatInTemplate(String archiveFormat) { if (!underConstruction) { final String msg = "Cannot modify job " + this + " as it is no longer under construction"; log.debug(msg); throw new IllegalState(msg); } orderXMLdoc.setArchiveFormat(archiveFormat); } /** * Create a new Job object from basic information stored in the DAO. * * @param harvestID the id of the harvestdefinition * @param configurations the configurations to base the Job on * @param channel the name of the channel on which the job will be submitted. * @param snapshot whether the job belongs to a snapshot harvest * @param forceMaxObjectsPerDomain the maximum number of objects harvested from a domain, * overrides individual configuration settings. 0 means no limit. * @param forceMaxBytesPerDomain The maximum number of bytes harvested from a domain, or -1 for * no limit.
* @param forceMaxJobRunningTime The max time in seconds given to the harvester for this job * @param status the current status of the job. * @param orderXMLname the name of the order template used. * @param orderXMLdoc the (possibly modified) template * @param seedlist the combined seedlist from all configs. * @param harvestNum the run number of the harvest definition */ Job( Long harvestID, Map<String, String> configurations, String channel, boolean snapshot, long forceMaxObjectsPerDomain, long forceMaxBytesPerDomain, long forceMaxJobRunningTime, JobStatus status, String orderXMLname, HeritrixTemplate orderXMLdoc, String seedlist, int harvestNum, Long continuationOf) { origHarvestDefinitionID = harvestID; domainConfigurationMap = configurations; this.channel = channel; this.isSnapshot = snapshot; this.forceMaxBytesPerDomain = forceMaxBytesPerDomain; this.forceMaxObjectsPerDomain = forceMaxObjectsPerDomain; this.forceMaxRunningTime = forceMaxJobRunningTime; this.status = status; this.orderXMLname = orderXMLname; this.orderXMLdoc = orderXMLdoc; this.setSeedList(seedlist); this.harvestNum = harvestNum; this.continuationOF = continuationOf; underConstruction = false; } /** * Adds a configuration to this Job. Seedlists and settings are updated accordingly. * * @param cfg the configuration to add * @throws ArgumentNotValid if cfg is null or cfg uses a different orderxml than this job or if * this job already contains a configuration associated with the domain of configuration cfg. */ public void addConfiguration(DomainConfiguration cfg) { ArgumentNotValid.checkNotNull(cfg, "cfg"); if (domainConfigurationMap.containsKey(cfg.getDomainName())) { throw new ArgumentNotValid( "Job already has a configuration for Domain " + cfg.getDomainName()); } if (log.isTraceEnabled()) { log.trace("Adding configuration '{}' to job '{}'", cfg.getName(), this); } if (!underConstruction) { final String msg = "Cannot modify job " + this + " as it is no longer under construction"; log.debug(msg); throw new IllegalState(msg); } if (!cfg.getOrderXmlName().equals(getOrderXMLName())) { throw new ArgumentNotValid( "Job requires the orderxml file:'" + getOrderXMLName() + "' not:'" + cfg.getOrderXmlName() + "' used by the configuration:'" + cfg.getName()); } domainConfigurationMap.put(cfg.getDomainName(), cfg.getName()); // Add the seeds from the configuration to the Job seeds. // Take care of duplicates. for (Iterator<SeedList> itt = cfg.getSeedLists(); itt.hasNext(); ) { SeedList seed = itt.next(); List<String> seeds = seed.getSeeds(); for (String seedUrl : seeds) { seedListSet.add(seedUrl); // duplicates are silently ignored // TODO remove when heritrix implements this functionality // try to convert a seed into an Internationalized Domain Name try { String seedASCII = seedUrl; // It is rare to see these seeds, but they need to be // correctly idnaized if (seedUrl.contains(":") || seedUrl.contains("/")) { String normalizedUrl = seedUrl; if (!normalizedUrl.matches("^[a-zA-Z]+:.*")) { // If no protocol is given, assume http normalizedUrl = "http://" + normalizedUrl; } URL url = new URL(normalizedUrl); String domainName = url.getHost(); String domainNameASCII = IDNA.toASCII(domainName); if (!domainName.equals(domainNameASCII)) { // If the domain name changed, replace that in the seed.
seedASCII = seedUrl.replaceFirst(Pattern.quote(domainName), domainNameASCII); } } else { seedASCII = IDNA.toASCII(seedUrl); } if (!seedASCII.equals(seedUrl)) { log.trace("Converted {} to {}", seedUrl, seedASCII); // Note that duplicates are silently ignored seedListSet.add(seedASCII); } } catch (IDNAException e) { log.trace("Cannot convert seed {} to ASCII", seedUrl, e); } catch (MalformedURLException e) { log.trace("Cannot convert seed {} to ASCII", seedUrl, e); } } } orderXMLdoc.editOrderXMLAddPerDomainCrawlerTraps(cfg); // TODO update limits in settings files - see also bug 269 // Update estimates of job size long expectation = cfg.getExpectedNumberOfObjects(forceMaxObjectsPerDomain, forceMaxBytesPerDomain); maxCountObjects = Math.max(expectation, maxCountObjects); minCountObjects = Math.min(expectation, minCountObjects); totalCountObjects += expectation; configsChanged = true; assert (maxCountObjects >= minCountObjects) : "basic invariant"; } /** * Get the name of the order XML file used by this Job. * * @return the name of the orderXML file */ public String getOrderXMLName() { return orderXMLname; } /** * Get the actual time when this job was stopped/completed. * * @return the time as Date */ public Date getActualStop() { return actualStop; } /** * Get the actual time when this job was started. * * @return the time as Date */ public Date getActualStart() { return actualStart; } /** * Get the time when this job was submitted. * * @return the time as Date */ public Date getSubmittedDate() { return submittedDate; } /** * Get the time when this job was created. * * @return the creation time as a <code>Date</code> */ public Date getCreationDate() { return creationDate; } /** * Get a list of Heritrix settings.xml files. Note that these files have nothing to do with * NetarchiveSuite settings files. They are files that supplement the Heritrix order.xml files, * and contain overrides for specific domains. * * @return the list of Files as an array */ public File[] getSettingsXMLfiles() { return settingsXMLfiles; } /** * Get the id of the HarvestDefinition from which this job originates. * * @return the id as a Long */ public Long getOrigHarvestDefinitionID() { return origHarvestDefinitionID; } /** * Get the id of this Job. * * @return the id as a Long */ public Long getJobID() { return jobID; } /** * Set the id of this Job. * * @param id The Id for this job. */ public void setJobID(Long id) { jobID = id; } /** * Gets the total number of different domains harvested by this job. * * @return the number of configurations added to this job */ public int getCountDomains() { return domainConfigurationMap.size(); } /** * Set the actual time when this job was started. * * <p>Logs a warning if actualStart is set to a time after actualStop. * * @param actualStart A Date object representing the time when this job was started. */ public void setActualStart(Date actualStart) { ArgumentNotValid.checkNotNull(actualStart, "actualStart"); if (actualStop != null && actualStop.before(actualStart)) { log.warn( "Job(" + getJobID() + "): Start time (" + actualStart + ") is after end time: " + actualStop); } this.actualStart = (Date) actualStart.clone(); } /** * Set the actual time when this job was stopped/completed. Logs a warning if actualStop is * set to a time before actualStart. * * @param actualStop A Date object representing the time when this job was stopped.
* @throws ArgumentNotValid */ public void setActualStop(Date actualStop) throws ArgumentNotValid { ArgumentNotValid.checkNotNull(actualStop, "actualStop"); if (actualStart == null) { log.warn("Job(" + getJobID() + "): actualStart should be defined before setting actualStop"); } else if (actualStop.before(actualStart)) { log.warn( "Job(" + getJobID() + "): actualStop (" + actualStop + ") is before actualStart: " + actualStart); } this.actualStop = (Date) actualStop.clone(); } /** * Set the orderxml for this job. * * @param doc An orderxml to be used by this job */ public void setOrderXMLDoc(HeritrixTemplate doc) { ArgumentNotValid.checkNotNull(doc, "doc"); this.orderXMLdoc = doc; } /** * Gets a document representation of the order.xml associated with this Job. * * @return the XML as a org.dom4j.Document */ public HeritrixTemplate getOrderXMLdoc() { return orderXMLdoc; } // /** // * Gets a list of document representations of the settings.xml's associated with this Job. // * // * @return the XML as an array of org.dom4j.Document // */ // public Document[] getSettingsXMLdocs() { // return settingsXMLdocs; // } /** * Set the seedlist of the job from the seedList argument. Individual seeds are separated by a * '\n' character. Duplicate seeds are removed. * * @param seedList List of seeds as one String */ public void setSeedList(String seedList) { ArgumentNotValid.checkNotNullOrEmpty(seedList, "seedList"); seedListSet = new HashSet<>(); BufferedReader reader = new BufferedReader(new StringReader(seedList)); String seed; try { while ((seed = reader.readLine()) != null) { seedListSet.add(seed); // add to seedlist if not already there } } catch (IOException e) { // This never happens, as we're reading from a string! throw new IOFailure("IOException reading from seed string", e); } finally { IOUtils.closeQuietly(reader); } } /** * Get the seedlist as a String. The individual seeds are separated by the character '\n'. The * order of the seeds is unknown. * * @return the seedlist as a String */ public String getSeedListAsString() { return StringUtils.conjoin("\n", seedListSet); } /** * Get the current status of this Job. * * @return the status as a {@link JobStatus} */ public JobStatus getStatus() { return status; } /** * Sets status of this job. * * @param newStatus Must be one of the values STATUS_NEW, ..., STATUS_FAILED * @throws ArgumentNotValid in case of invalid status argument or invalid status change */ public void setStatus(JobStatus newStatus) { ArgumentNotValid.checkNotNull(newStatus, "newStatus"); if (!status.legalChange(newStatus)) { final String message = "Status change from " + status + " to " + newStatus + " is not allowed"; log.debug(message); throw new ArgumentNotValid(message); } if ((this.status == JobStatus.NEW || this.status == JobStatus.RESUBMITTED) && newStatus == JobStatus.SUBMITTED) { orderXMLdoc.configureQuotaEnforcer( maxObjectsIsSetByQuotaEnforcer, forceMaxBytesPerDomain, forceMaxObjectsPerDomain); } if (this.status == JobStatus.SUBMITTED && newStatus == JobStatus.STARTED) { setActualStart(new Date()); } if (this.status == JobStatus.STARTED && (newStatus == JobStatus.DONE || newStatus == JobStatus.FAILED)) { setActualStop(new Date()); } status = newStatus; } /** * Returns a map of domain names and the names of their corresponding configurations. * * <p>The returned Map cannot be changed.
* * @return a read-only Map from domain name to configuration name */ public Map<String, String> getDomainConfigurationMap() { return Collections.unmodifiableMap(domainConfigurationMap); } /** * Gets the maximum number of objects harvested per domain. * * @return The maximum number of objects harvested per domain. 0 means no limit. */ public long getMaxObjectsPerDomain() { return forceMaxObjectsPerDomain; } /** * Gets the maximum number of bytes harvested per domain. * * @return The maximum number of bytes harvested per domain. -1 means no limit. */ public long getMaxBytesPerDomain() { return forceMaxBytesPerDomain; } /** * Get the edition number. * * @return The edition number */ long getEdition() { return edition; } /** * Set the edition number. * * @param edition the new edition number */ void setEdition(long edition) { this.edition = edition; } /** * Sets the associated harvest channel, deriving both the channel name and the snapshot flag * from it. * * @param harvestChannel the {@link HarvestChannel} to associate with this job */ public void setHarvestChannel(HarvestChannel harvestChannel) { this.channel = harvestChannel.getName(); this.isSnapshot = harvestChannel.isSnapshot(); } /** @return the associated {@link HarvestChannel} name. */ public String getChannel() { return channel; } /** * Sets the associated {@link HarvestChannel} name. * * @param channel the channel name */ public void setChannel(String channel) { this.channel = channel; } /** * @return true if the job belongs to a snapshot harvest, false if it belongs to a focused * harvest. */ public boolean isSnapshot() { return isSnapshot; } /** * Sets whether the job belongs to a snapshot or focused harvest. * * @param isSnapshot true if the job belongs to a snapshot harvest, false if it belongs to a * focused harvest. */ public void setSnapshot(boolean isSnapshot) { this.isSnapshot = isSnapshot; } @Override public String toString() { return "Job " + getJobID() + " (state = " + getStatus() + ", HD = " + getOrigHarvestDefinitionID() + ", channel = " + getChannel() + ", snapshot = " + isSnapshot() + ", forcemaxcount = " + getForceMaxObjectsPerDomain() + ", forcemaxbytes = " + getMaxBytesPerDomain() + ", forcemaxrunningtime = " + forceMaxRunningTime + ", orderxml = " + getOrderXMLName() + ", numconfigs = " + getDomainConfigurationMap().size() + ", created = " + getCreationDate() + (getSubmittedDate() != null ? ", submitted = " + getSubmittedDate() : "") + (getActualStart() != null ? ", started = " + getActualStart() : "") + (getActualStop() != null ? ", stopped = " + getActualStop() : "") + ")"; } /** @return the forceMaxObjectsPerDomain. 0 means no limit. */ public long getForceMaxObjectsPerDomain() { return forceMaxObjectsPerDomain; } /** * Sets the maxObjectsPerDomain value. * * @param maxObjectsPerDomain The forceMaxObjectsPerDomain to set. 0 means no limit. * @throws IllegalState If the job is no longer under construction. */ protected void setMaxObjectsPerDomain(long maxObjectsPerDomain) { if (!underConstruction) { final String msg = "Cannot modify job " + this + " as it is no longer under construction"; log.debug(msg); throw new IllegalState(msg); } this.forceMaxObjectsPerDomain = maxObjectsPerDomain; orderXMLdoc.setMaxObjectsPerDomain( maxObjectsPerDomain); // FIXME? add argument to maxObjectsIsSetByQuotaEnforcer to method // setMaxObjectsPerDomain // orderXMLdoc.editOrderXML_maxObjectsPerDomain(orderXMLdoc, maxObjectsPerDomain, // maxObjectsIsSetByQuotaEnforcer); if (0L == maxObjectsPerDomain && 0L != forceMaxBytesPerDomain) { setMaxBytesPerDomain(0L); } } /** * Set the max bytes per domain value. * * @param maxBytesPerDomain The maxBytesPerDomain to set, or -1 for no limit. 
*/ protected void setMaxBytesPerDomain(long maxBytesPerDomain) { if (!underConstruction) { final String msg = "Cannot modify job " + this + " as it is no longer under construction"; log.debug(msg); throw new IllegalState(msg); } this.forceMaxBytesPerDomain = maxBytesPerDomain; orderXMLdoc.setMaxBytesPerDomain(maxBytesPerDomain); if (0L == maxBytesPerDomain && 0L != forceMaxObjectsPerDomain) { setMaxObjectsPerDomain(0L); } } /** * Set the maxJobRunningTime value. * * @param maxJobRunningTime The maxJobRunningTime in seconds to set, or 0 for no limit. */ protected void setMaxJobRunningTime(long maxJobRunningTime) { if (!underConstruction) { final String msg = "Cannot modify job " + this + " as it is no longer under construction"; log.debug(msg); throw new IllegalState(msg); } this.forceMaxRunningTime = maxJobRunningTime; orderXMLdoc.setMaxJobRunningTime(maxJobRunningTime); } /** @return the maxJobRunningTime. 0 means no limit. */ public long getMaxJobRunningTime() { return forceMaxRunningTime; } /** * Get the harvestNum for this job. The number reflects which run of the harvest definition this * is. * * @return the harvestNum for this job. */ public int getHarvestNum() { return harvestNum; } /** * Set the harvestNum for this job. The number reflects which run of the harvest definition this * is. ONLY TO BE USED IN THE CONSTRUCTION PHASE. * * @param harvestNum a given harvestNum */ public void setHarvestNum(int harvestNum) { if (!underConstruction) { final String msg = "Cannot modify job " + this + " as it is no longer under construction"; log.debug(msg); throw new IllegalState(msg); } this.harvestNum = harvestNum; } /** * Get the list of harvest errors for this job. If there are no harvest errors, null is * returned. This value is not meaningful until the job is finished (FAILED, DONE, RESUBMITTED). * * @return the harvest errors for this job or null if no harvest errors. */ public String getHarvestErrors() { return harvestErrors; } /** * Append to the list of harvest errors for this job. Nothing happens if the argument * harvestErrors is null. * * @param harvestErrors a string containing harvest errors (may be null) */ public void appendHarvestErrors(String harvestErrors) { if (harvestErrors != null) { if (this.harvestErrors == null) { this.harvestErrors = harvestErrors; } else { this.harvestErrors += "\n" + harvestErrors; } } } /** * Get the list of harvest error details for this job. If there are no harvest error details, * null is returned. This value is not meaningful until the job is finished (FAILED, DONE, * RESUBMITTED). * * @return the list of harvest error details for this job or null if no harvest error details. */ public String getHarvestErrorDetails() { return harvestErrorDetails; } /** * Append to the list of harvest error details for this job. Nothing happens if the argument * harvestErrorDetails is null. * * @param harvestErrorDetails a string containing harvest error details. */ public void appendHarvestErrorDetails(String harvestErrorDetails) { if (harvestErrorDetails != null) { if (this.harvestErrorDetails == null) { this.harvestErrorDetails = harvestErrorDetails; } else { this.harvestErrorDetails += "\n" + harvestErrorDetails; } } } /** * Get the list of upload errors. If there are no upload errors, null is returned. This value is * not meaningful until the job is finished (FAILED, DONE, RESUBMITTED). * * @return the list of upload errors as String, or null if no upload errors. */ public String getUploadErrors() { return uploadErrors; } /** * Append to the list of upload errors. 
Nothing happens if the argument uploadErrors is null. * * @param uploadErrors a string containing upload errors. */ public void appendUploadErrors(String uploadErrors) { if (uploadErrors != null) { if (this.uploadErrors == null) { this.uploadErrors = uploadErrors; } else { this.uploadErrors += "\n" + uploadErrors; } } } /** * Get the list of upload error details. If there are no upload error details, null is returned. * This value is not meaningful until the job is finished (FAILED, DONE, RESUBMITTED). * * @return the list of upload error details as String, or null if no upload error details */ public String getUploadErrorDetails() { return uploadErrorDetails; } /** * Append to the list of upload error details. Nothing happens if the argument * uploadErrorDetails is null. * * @param uploadErrorDetails a string containing upload error details. */ public void appendUploadErrorDetails(String uploadErrorDetails) { if (uploadErrorDetails != null) { if (this.uploadErrorDetails == null) { this.uploadErrorDetails = uploadErrorDetails; } else { this.uploadErrorDetails += "\n" + uploadErrorDetails; } } } /** * Get the ID for the job which this job was resubmitted as. If null, this job has not been * resubmitted. * * @return the ID of the job this job was resubmitted as, or null. */ public Long getResubmittedAsJob() { return resubmittedAsJobWithID; } /** * Set the Date for when this job was submitted. If null, this job has not been submitted. * * @param submittedDate The date when this was submitted */ public void setSubmittedDate(Date submittedDate) { this.submittedDate = submittedDate; } /** * Set the Date for when this job was created. If null, the creation date has not been set. * * @param creationDate The date when this was created */ public void setCreationDate(Date creationDate) { this.creationDate = creationDate; } /** * Set the ID for the job which this job was resubmitted as. * * @param resubmittedAsJob An Id for a new job. */ public void setResubmittedAsJob(Long resubmittedAsJob) { this.resubmittedAsJobWithID = resubmittedAsJob; } /** * @return the id of the job that this job is supposed to continue, using the Heritrix * recover-log, or null if it starts from scratch. */ public Long getContinuationOf() { return this.continuationOF; } @Override public String getHarvestFilenamePrefix() { if (this.harvestnamePrefix == null) { log.warn( "HarvestnamePrefix not yet set for job {}. Set it by using the naming scheme. 
" + "This should only happen for old jobs being read", this.jobID); setDefaultHarvestNamePrefix(); } return this.harvestnamePrefix; } /** @param prefix */ public void setHarvestFilenamePrefix(String prefix) { this.harvestnamePrefix = prefix; } /** @return the forceMaxBytesPerDomain */ public long getForceMaxBytesPerDomain() { return forceMaxBytesPerDomain; } /** @return the configurationSetsObjectLimit */ public boolean isConfigurationSetsObjectLimit() { return configurationSetsObjectLimit; } /** @return the configurationSetsByteLimit */ public boolean isConfigurationSetsByteLimit() { return configurationSetsByteLimit; } /** @return the minCountObjects */ public long getMinCountObjects() { return minCountObjects; } /** @return the maxCountObjects */ public long getMaxCountObjects() { return maxCountObjects; } /** @return the totalCountObjects */ public long getTotalCountObjects() { return totalCountObjects; } void setDefaultHarvestNamePrefix() { if (getJobID() != null) { ArchiveFileNaming naming = ArchiveFileNamingFactory.getInstance(); log.debug("Applying the default ArchiveFileNaming class '{}'.", naming.getClass().getName()); final String prefix = naming.getPrefix(this); setHarvestFilenamePrefix(prefix); log.debug("The harvestPrefix of this job is: {}", prefix); } else { log.warn( "The harvestnamePrefix is not set now, as it depends on the JobID, which is not set yet"); } } /** @return the harvest-audience. */ public String getHarvestAudience() { return harvestAudience; } /** * Set the harvest audience for this job. Taken from the harvestdefinition that generated this * job. * * @param theAudience the harvest-audience. */ public void setHarvestAudience(String theAudience) { this.harvestAudience = theAudience; } ///////////// The following two methods are needed by harvestStatus-jobdetails.jsp // //////////////////////////////////// /** * Returns a list of sorted seeds for this job. The sorting is by domain, and inside each domain, * the list is sorted by url * * @return a list of sorted seeds for this job. */ public List<String> getSortedSeedList() { Map<String, Set<String>> urlMap = new HashMap<String, Set<String>>(); for (String seed : seedListSet) { String url; // Assume the protocol is http://, if it is missing if (!seed.matches(Constants.PROTOCOL_REGEXP)) { url = "http://" + seed; } else { url = seed; } String domain = getDomain(url); if (domain == null) { // stop processing this url, and continue to the next seed continue; } Set<String> set; if (urlMap.containsKey(domain)) { set = urlMap.get(domain); } else { set = new TreeSet<String>(); urlMap.put(domain, set); } set.add(seed); } List<String> result = new ArrayList<String>(); for (Set<String> set : urlMap.values()) { result.addAll(set); } return result; } /** * Get the domain, that the given URL belongs to. * * @param url an URL * @return the domain, that the given URL belongs to, or null if unable to do so. */ private String getDomain(String url) { try { URL uri = new URL(url); return DomainUtils.domainNameFromHostname(uri.getHost()); } catch (MalformedURLException e) { log.warn("The string '{}' is not a valid URL", url); return null; } } }
/** Unit test for the archive upload tool. */ public class UploadTester { private UseTestRemoteFile ulrf = new UseTestRemoteFile(); private PreventSystemExit pse = new PreventSystemExit(); private PreserveStdStreams pss = new PreserveStdStreams(true); private MoveTestFiles mtf = new MoveTestFiles(TestInfo.DATA_DIR, TestInfo.WORKING_DIR); private MockupJMS mjms = new MockupJMS(); private MockupArcRepositoryClient marc = new MockupArcRepositoryClient(); ReloadSettings rs = new ReloadSettings(); /** Max number of store retries. */ private final int storeRetries = Settings.getInt(JMSArcRepositoryClient.ARCREPOSITORY_STORE_RETRIES); @Before public void setUp() { rs.setUp(); ulrf.setUp(); mjms.setUp(); mtf.setUp(); pss.setUp(); pse.setUp(); marc.setUp(); Settings.set(CommonSettings.NOTIFICATIONS_CLASS, RememberNotifications.class.getName()); } @After public void tearDown() { marc.tearDown(); pse.tearDown(); pss.tearDown(); mtf.tearDown(); mjms.tearDown(); ulrf.tearDown(); RememberNotifications.resetSingleton(); rs.tearDown(); } @Test public void testConstructor() { ReflectUtils.testUtilityConstructor(Upload.class); } /** Verify that uploading a single ARC file works as expected and deletes the file locally. */ @Test public void testMainOneFile() { Upload.main(new String[] {TestInfo.ARC1.getAbsolutePath()}); assertMsgCount(1, 0); assertStoreStatus(0, TestInfo.ARC1, true); } /** * Verify that uploading more than one ARC file works as expected and deletes the files locally. */ @Test public void testMainSeveralFiles() { Upload.main(new String[] {TestInfo.ARC1.getAbsolutePath(), TestInfo.ARC2.getAbsolutePath()}); assertMsgCount(2, 0); assertStoreStatus(0, TestInfo.ARC1, true); assertStoreStatus(1, TestInfo.ARC2, true); } /** * Verify that non-ARC files are rejected and execution fails, and that nothing is stored in * that case. */ @Test public void testMainNonArc() { try { Upload.main( new String[] {TestInfo.ARC1.getAbsolutePath(), TestInfo.INDEX_DIR.getAbsolutePath()}); fail("Calling Upload with non-arc file should System.exit"); } catch (SecurityException e) { // Expected assertMsgCount(0, 0); } } /** Verify that uploading a single WARC file works as expected and deletes the file locally. */ @Test public void testMainOneWarcFile() { Upload.main(new String[] {TestInfo.WARC1.getAbsolutePath()}); assertMsgCount(1, 0); assertStoreStatus(0, TestInfo.WARC1, true); } /** * Verify that the system fails as expected when the store operation fails on the server side. * (The file whose store fails must NOT be deleted locally.) */ @Test public void testMainStoreFails1() { marc.failOnFile(TestInfo.ARC1.getName()); Upload.main( new String[] { TestInfo.ARC1.getAbsolutePath(), TestInfo.ARC2.getAbsolutePath(), TestInfo.ARC3.getAbsolutePath() }); assertMsgCount(2, 1); int index = 0; for (int i = 0; i < storeRetries; i++) { assertStoreStatus(index, TestInfo.ARC1, false); index++; } assertStoreStatus(index, TestInfo.ARC2, true); index++; assertStoreStatus(index, TestInfo.ARC3, true); } /** * Verify that the system fails as expected when the store operation fails on the server side. * (The file whose store fails must NOT be deleted locally.) 
*/ @Test public void testMainStoreFails2() { marc.failOnFile(TestInfo.ARC2.getName()); Upload.main( new String[] { TestInfo.ARC1.getAbsolutePath(), TestInfo.ARC2.getAbsolutePath(), TestInfo.ARC3.getAbsolutePath() }); assertMsgCount(2, 1); int index = 0; assertStoreStatus(index, TestInfo.ARC1, true); index++; for (int i = 0; i < storeRetries; i++) { assertStoreStatus(index, TestInfo.ARC2, false); index++; } assertStoreStatus(index, TestInfo.ARC3, true); } /** * Verify that the system fails as expected when the store operation fails on the server side. * (The file whose store fails must NOT be deleted locally.) */ @Test public void testMainStoreFails3() { marc.failOnFile(TestInfo.ARC3.getName()); Upload.main( new String[] { TestInfo.ARC1.getAbsolutePath(), TestInfo.ARC2.getAbsolutePath(), TestInfo.ARC3.getAbsolutePath() }); assertMsgCount(2, 1); int index = 0; assertStoreStatus(index, TestInfo.ARC1, true); index++; assertStoreStatus(index, TestInfo.ARC2, true); index++; for (int i = 0; i < storeRetries; i++) { assertStoreStatus(index, TestInfo.ARC3, false); index++; } } /** * Verify that calling Upload without arguments fails, and that nothing is stored in that case. */ @Test public void testNoArguments() { try { Upload.main(new String[] {}); fail("Calling Upload without arguments should System.exit"); } catch (SecurityException e) { // Expected assertMsgCount(0, 0); } } /** * Asserts that we got the expected number of StoreMessages. * * @param succeeded Number of files successfully stored * @param failed Number of files that never got stored */ private void assertMsgCount(int succeeded, int failed) { int expected = succeeded + failed * storeRetries; assertEquals( "Upload should generate exactly 1 StoreMessage " + "per succeeded arc file and " + storeRetries + " per failed store", expected, marc.getMsgCount()); } /** * Asserts that the nth StoreMessage is regarding the given arc file and that the arc file is * deleted if and only if the store succeeded. * * @param n The relevant index into marc.getStoreMsgs() * @param arcFile The arc file that was stored * @param shouldSucceed Whether store was supposed to succeed */ private void assertStoreStatus(int n, File arcFile, boolean shouldSucceed) { StoreMessage sm = marc.getStoreMsgs().get(n); assertEquals( "Upload should attempt to upload the specified files", arcFile.getName(), sm.getArcfileName()); if (shouldSucceed) { assertFalse("Upload should delete a properly uploaded file", arcFile.exists()); } else { assertTrue( "Upload should not delete a file that wasn't properly uploaded", arcFile.exists()); } } }
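/*
 * Editor's sketch (assumed, not the actual Upload implementation): the assertions above encode
 * a simple bounded-retry contract. Every store attempt produces exactly one StoreMessage, a
 * file whose store keeps failing is retried storeRetries times and kept locally, and a
 * successfully stored file is deleted locally. Under MockupArcRepositoryClient a file either
 * succeeds on the first attempt or fails on every attempt, which is why assertMsgCount()
 * expects succeeded + failed * storeRetries messages in total. The class and interface names
 * below are hypothetical.
 */
class RetryingStoreSketch {

    /** Hypothetical store backend; one call corresponds to one StoreMessage in the tests. */
    interface StoreBackend {
        boolean store(java.io.File file);
    }

    private final StoreBackend backend;
    private final int storeRetries;

    RetryingStoreSketch(StoreBackend backend, int storeRetries) {
        this.backend = backend;
        this.storeRetries = storeRetries;
    }

    /**
     * Attempts to store the file up to storeRetries times, deleting the local file only after
     * a successful store, which mirrors the behaviour the tests assert.
     */
    boolean upload(java.io.File file) {
        for (int attempt = 1; attempt <= storeRetries; attempt++) {
            if (backend.store(file)) { // success on the first attempt yields a single message
                if (!file.delete()) {
                    throw new IllegalStateException("Could not delete " + file);
                }
                return true;
            }
        }
        return false; // give up after storeRetries attempts; the local file is deliberately kept
    }
}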