/**
 * Calls the Unix sort command with the options <code>$filesNames -o
 * $outputfile -T WaybackSettings#WAYBACK_AGGREGATOR_TEMP_DIR</code>.
 *
 * Sets the LC_ALL environment variable before making the call.
 *
 * @param files The files to merge and sort
 * @param outputFile The resulting sorted file
 * @param additionalArgs A list of extra arguments, which (if different from
 *            null) are added to the sort call.<p> Note: If any of the args
 *            contain a whitespace the call will fail.
 */
private void processFiles(File[] files, File outputFile, List<String> additionalArgs) {
    if (files.length == 0) {
        // An empty file list would cause sort to wait for input on stdin,
        // so the call would never return
        return;
    }
    Process p = null;
    try {
        List<String> inputFileList = new LinkedList<String>();
        for (int i = 0; i < files.length; i++) {
            if (files[i].exists() && files[i].isFile()) {
                inputFileList.add(files[i].getCanonicalPath());
            } else {
                log.warn("File " + files[i] + " doesn't exist or isn't a regular file, "
                        + "dropping from list of files to sort and merge");
            }
        }
        List<String> cmd = new LinkedList<String>();
        // Prepare to run the unix sort command, see sort manual page for details
        cmd.add("sort");
        cmd.addAll(inputFileList);
        cmd.add("-o");
        cmd.add(outputFile.getCanonicalPath());
        cmd.add("-T");
        cmd.add(Settings.get(WaybackSettings.WAYBACK_AGGREGATOR_TEMP_DIR));
        if (additionalArgs != null && !additionalArgs.isEmpty()) {
            for (String argument : additionalArgs) {
                ArgumentNotValid.checkTrue(argument.indexOf(' ') == -1,
                        "The argument '" + argument + "' contains spaces, this isn't allowed");
            }
            cmd.addAll(additionalArgs);
        }
        ProcessBuilder pb = new ProcessBuilder(cmd);
        // Reset all locale definitions
        pb.environment().put("LC_ALL", "C");
        // Run the command in the user.dir directory
        pb.directory(new File(System.getProperty("user.dir")));
        p = pb.start();
        p.waitFor();
        if (p.exitValue() != 0) {
            log.error("Failed to sort index files, sort exited with return code " + p.exitValue());
        }
    } catch (Exception e) {
        log.error("Failed to aggregate indexes ", e);
    }
}
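The essence of the method above is building an external `sort` invocation with ProcessBuilder and forcing the POSIX locale so the merge order is byte-based rather than locale-dependent. The following is a minimal, self-contained sketch of that pattern; the file paths are hypothetical and it uses `inheritIO()` instead of the project's logging, purely for illustration.

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

public class SortInvocationSketch {
    public static void main(String[] args) throws IOException, InterruptedException {
        // Hypothetical input/output paths, for illustration only.
        List<String> cmd = new ArrayList<>();
        cmd.add("sort");
        cmd.add("/tmp/index-part-1.cdx");
        cmd.add("/tmp/index-part-2.cdx");
        cmd.add("-o");
        cmd.add("/tmp/index-merged.cdx");
        cmd.add("-T");
        cmd.add("/tmp/sort-tmp");

        ProcessBuilder pb = new ProcessBuilder(cmd);
        // Force the POSIX locale so sort compares raw bytes instead of
        // using locale-dependent collation.
        pb.environment().put("LC_ALL", "C");
        pb.inheritIO();

        Process p = pb.start();
        int exit = p.waitFor();
        if (exit != 0) {
            System.err.println("sort exited with return code " + exit);
        }
    }
}

As in the original method, the exit value is checked after waitFor(); a non-zero code indicates the merge failed and should be reported rather than silently ignored.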
public class IndexerTestCase {

    private String oldClient = System.getProperty(CommonSettings.ARC_REPOSITORY_CLIENT);
    private String oldFileDir = System.getProperty("settings.common.arcrepositoryClient.fileDir");
    protected static File tempdir = new File(Settings.get(WaybackSettings.WAYBACK_INDEX_TEMPDIR));

    ReloadSettings rs = new ReloadSettings();

    @Before
    public void setUp() {
        rs.setUp();
        System.setProperty(WaybackSettings.HIBERNATE_HBM2DDL_AUTO, "create-drop");
        HibernateUtil.getSession().getSessionFactory().close();
        FileUtils.removeRecursively(TestInfo.WORKING_DIR);
        TestFileUtils.copyDirectoryNonCVS(TestInfo.ORIGINALS_DIR, TestInfo.WORKING_DIR);
        System.setProperty(CommonSettings.ARC_REPOSITORY_CLIENT,
                "dk.netarkivet.common.distribute.arcrepository.LocalArcRepositoryClient");
        System.setProperty("settings.common.arcrepositoryClient.fileDir",
                TestInfo.FILE_DIR.getAbsolutePath());
        System.setProperty(CommonSettings.REMOTE_FILE_CLASS,
                "dk.netarkivet.common.distribute.TestRemoteFile");
        assertTrue(ArcRepositoryClientFactory.getPreservationInstance()
                instanceof LocalArcRepositoryClient);
    }

    @After
    public void tearDown() {
        HibernateUtil.getSession().getSessionFactory().close();
        FileUtils.removeRecursively(TestInfo.WORKING_DIR);
        if (oldClient != null) {
            System.setProperty(CommonSettings.ARC_REPOSITORY_CLIENT, oldClient);
        } else {
            System.setProperty(CommonSettings.ARC_REPOSITORY_CLIENT, "");
        }
        if (oldFileDir != null) {
            System.setProperty("settings.common.arcrepositoryClient.fileDir", oldFileDir);
        } else {
            System.setProperty("settings.common.arcrepositoryClient.fileDir", "");
        }
        rs.tearDown();
    }
}
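The test above relies on saving system properties in setUp and restoring them in tearDown so later tests are not affected. The sketch below shows that pattern in isolation with a hypothetical property key and plain JUnit 4; it also uses System.clearProperty() for the previously-unset case, which avoids leaving an empty string behind the way the original tearDown does.

import static org.junit.Assert.assertEquals;

import org.junit.After;
import org.junit.Before;
import org.junit.Test;

public class SystemPropertyRestoreSketch {

    private static final String KEY = "example.test.property"; // hypothetical key
    private String oldValue;

    @Before
    public void setUp() {
        // Remember the previous value (may be null) before overriding it.
        oldValue = System.getProperty(KEY);
        System.setProperty(KEY, "test-value");
    }

    @After
    public void tearDown() {
        // Restore the original value; clear the property entirely if it was unset.
        if (oldValue != null) {
            System.setProperty(KEY, oldValue);
        } else {
            System.clearProperty(KEY);
        }
    }

    @Test
    public void propertyIsOverriddenDuringTheTest() {
        assertEquals("test-value", System.getProperty(KEY));
    }
}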
/**
 * Package private constructor for common initialisation.
 *
 * @param harvestID the id of the harvestdefinition
 * @param cfg the configuration to base the Job on
 * @param orderXMLdoc the Heritrix template to use for the job
 * @param channel the channel on which the job will be submitted.
 * @param forceMaxObjectsPerDomain the maximum number of objects harvested from a domain,
 *            overrides individual configuration settings. -1 means no limit
 * @param forceMaxBytesPerDomain The maximum number of bytes harvested from a domain, or -1 for
 *            no limit.
 * @param forceMaxJobRunningTime The max time in seconds given to the harvester for this job
 * @param harvestNum the run number of the harvest definition
 * @throws ArgumentNotValid if cfg, harvestID or channel is null, harvestID is invalid, or if any
 *             limit < -1
 */
public Job(Long harvestID, DomainConfiguration cfg, HeritrixTemplate orderXMLdoc, HarvestChannel channel,
        long forceMaxObjectsPerDomain, long forceMaxBytesPerDomain, long forceMaxJobRunningTime,
        int harvestNum) throws ArgumentNotValid {
    ArgumentNotValid.checkNotNull(cfg, "cfg");
    ArgumentNotValid.checkNotNull(harvestID, "harvestID");
    ArgumentNotValid.checkNotNegative(harvestID, "harvestID");
    ArgumentNotValid.checkNotNull(channel, "channel");

    if (forceMaxObjectsPerDomain < -1) {
        String msg = "forceMaxObjectsPerDomain must be either -1 or positive";
        log.debug(msg);
        throw new ArgumentNotValid(msg);
    }
    if (forceMaxBytesPerDomain < -1) {
        String msg = "forceMaxBytesPerDomain must be either -1 or positive";
        log.debug(msg);
        throw new ArgumentNotValid(msg);
    }

    if (forceMaxBytesPerDomain == 0L) {
        log.warn("forceMaxBytesPerDomain should probably not be 0. Means 0 bytes downloaded per domain");
    }
    if (forceMaxObjectsPerDomain == 0L) {
        log.warn("forceMaxObjectsPerDomain should probably not be 0. Means 0 objects downloaded per domain");
    }

    // setup initial members
    domainConfigurationMap = new HashMap<>();
    origHarvestDefinitionID = harvestID;
    orderXMLname = cfg.getOrderXmlName();
    this.orderXMLdoc = orderXMLdoc;

    setHarvestChannel(channel);

    long maxObjects = NumberUtils.minInf(forceMaxObjectsPerDomain, cfg.getMaxObjects());
    setMaxObjectsPerDomain(maxObjects);
    configurationSetsObjectLimit = (maxObjects != forceMaxObjectsPerDomain);

    long maxBytes = NumberUtils.minInf(forceMaxBytesPerDomain, cfg.getMaxBytes());
    setMaxBytesPerDomain(maxBytes);
    configurationSetsByteLimit = (maxBytes != forceMaxBytesPerDomain);

    long expectation = cfg.getExpectedNumberOfObjects(forceMaxObjectsPerDomain, forceMaxBytesPerDomain);
    maxCountObjects = expectation;
    minCountObjects = expectation;

    this.harvestNum = harvestNum;

    addConfiguration(cfg);

    setMaxJobRunningTime(forceMaxJobRunningTime);

    setArchiveFormatInTemplate(Settings.get(HarvesterSettings.HERITRIX_ARCHIVE_FORMAT));

    setAttributes(cfg.getAttributesAndTypes());

    orderXMLdoc.enableOrDisableDeduplication(Settings.getBoolean(HarvesterSettings.DEDUPLICATION_ENABLED));

    status = JobStatus.NEW;
}
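The constructor reconciles the forced per-domain limits with the limits in the DomainConfiguration, treating -1 as "no limit" on either side. The sketch below illustrates that reconciliation with a standalone helper; the minInf implementation is an assumption about the semantics of the project's NumberUtils.minInf, and the limit values are hypothetical.

public class LimitReconciliationSketch {

    /**
     * Minimum of two limits where -1 means "no limit". This mirrors the
     * behaviour assumed for NumberUtils.minInf in the Job constructor.
     */
    static long minInf(long a, long b) {
        if (a == -1) {
            return b;
        }
        if (b == -1) {
            return a;
        }
        return Math.min(a, b);
    }

    public static void main(String[] args) {
        long forceMaxObjectsPerDomain = 10_000; // hypothetical override from the harvest definition
        long configuredMaxObjects = -1;         // hypothetical per-configuration limit: unlimited

        long effective = minInf(forceMaxObjectsPerDomain, configuredMaxObjects);
        System.out.println("effective limit = " + effective); // 10000, the override wins

        // Same bookkeeping as the constructor: the effective limit did not
        // come from the configuration, so the flag is false here.
        boolean configurationSetsObjectLimit = (effective != forceMaxObjectsPerDomain);
        System.out.println("configurationSetsObjectLimit = " + configurationSetsObjectLimit);
    }
}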
/**
 * Batchjob that extracts lines matching a regular expression from a crawl log. The batch job
 * should be restricted to run on metadata files for a specific job only, using the
 * {@link #processOnlyFilesMatching(String)} construct.
 */
@SuppressWarnings({"serial"})
public class CrawlLogLinesMatchingRegexp extends ArchiveBatchJob {

    /** The logger. */
    // private final Log log = LogFactory.getLog(getClass().getName());
    private static final Logger log = LoggerFactory.getLogger(CrawlLogLinesMatchingRegexp.class);

    /** Metadata URL for crawl logs. */
    private static final String SETUP_URL_FORMAT = String.format("metadata://%s/crawl/logs/crawl.log",
            Settings.get(CommonSettings.ORGANIZATION));

    /** The regular expression to match in the crawl.log line. */
    private final String regexp;

    /**
     * Initialise the batch job.
     *
     * @param regexp The regexp to match in the crawl.log lines.
     */
    public CrawlLogLinesMatchingRegexp(String regexp) {
        ArgumentNotValid.checkNotNullOrEmpty(regexp, "regexp");
        this.regexp = regexp;
        // One week in milliseconds.
        batchJobTimeout = 7 * Constants.ONE_DAY_IN_MILLIES;
    }

    /**
     * Does nothing, no initialisation is needed.
     *
     * @param os Not used.
     */
    @Override
    public void initialize(OutputStream os) {
    }

    @Override
    public ArchiveBatchFilter getFilter() {
        return new ArchiveBatchFilter("OnlyCrawlLog") {
            public boolean accept(ArchiveRecordBase record) {
                String url = record.getHeader().getUrl();
                if (url == null) {
                    return false;
                }
                return url.startsWith(SETUP_URL_FORMAT);
            }
        };
    }

    /**
     * Process a crawl log record, writing the lines that match the regexp to the result.
     *
     * @param record The record to process.
     * @param os The output stream for the result.
     * @throws ArgumentNotValid on null parameters
     * @throws IOFailure on trouble processing the record.
     */
    @Override
    public void processRecord(ArchiveRecordBase record, OutputStream os) {
        ArgumentNotValid.checkNotNull(record, "ArchiveRecordBase record");
        ArgumentNotValid.checkNotNull(os, "OutputStream os");
        BufferedReader arcreader = new BufferedReader(new InputStreamReader(record.getInputStream()));
        try {
            for (String line = arcreader.readLine(); line != null; line = arcreader.readLine()) {
                if (line.matches(regexp)) {
                    os.write(line.getBytes("UTF-8"));
                    os.write('\n');
                }
            }
        } catch (IOException e) {
            throw new IOFailure("Unable to process (w)arc record", e);
        } finally {
            try {
                arcreader.close();
            } catch (IOException e) {
                log.warn("Unable to close arcreader properly", e);
            }
        }
    }

    /**
     * Does nothing, no finishing is needed.
     *
     * @param os Not used.
     */
    @Override
    public void finish(OutputStream os) {
    }

    @Override
    public String toString() {
        return getClass().getName() + ", with arguments: Regexp = " + regexp
                + ", Filter = " + getFilter();
    }
}
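In the full system this job is submitted through the batch framework and restricted to a single job's metadata files with processOnlyFilesMatching. The core of processRecord, however, is plain line filtering with String.matches(), which must match the whole line. The sketch below demonstrates that filtering against in-memory data; the crawl.log lines and the regexp are hypothetical.

import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;

public class CrawlLogRegexpSketch {
    public static void main(String[] args) throws IOException {
        // Two hypothetical crawl.log lines; only the first mentions example.com.
        String crawlLog = "2010-01-01T00:00:00.000Z 200 1234 http://example.com/ ...\n"
                + "2010-01-01T00:00:01.000Z 404 512 http://other.org/missing ...\n";
        // String.matches() anchors at both ends, hence the leading/trailing ".*".
        String regexp = ".*example\\.com.*";

        BufferedReader reader = new BufferedReader(new InputStreamReader(
                new ByteArrayInputStream(crawlLog.getBytes(StandardCharsets.UTF_8)),
                StandardCharsets.UTF_8));
        ByteArrayOutputStream os = new ByteArrayOutputStream();

        // Same per-line filtering that processRecord applies to a metadata record.
        for (String line = reader.readLine(); line != null; line = reader.readLine()) {
            if (line.matches(regexp)) {
                os.write(line.getBytes(StandardCharsets.UTF_8));
                os.write('\n');
            }
        }
        System.out.print(os.toString("UTF-8")); // prints only the example.com line
    }
}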