/**
   * Calls the Unix sort command with the options <code>$fileNames -o
   * $outputFile -T WaybackSettings#WAYBACK_AGGREGATOR_TEMP_DIR</code>.
   *
   * <p>Sets the LC_ALL environment variable to "C" before making the call.
   *
   * @param files The files to merge and sort
   * @param outputFile The resulting sorted file
   * @param additionalArgs A list of extra arguments which, if non-null, are
   *                       added to the sort call. <p>Note: if any of the
   *                       arguments contains whitespace, the call will fail.
   */
  private void processFiles(File[] files, File outputFile, List<String> additionalArgs) {
    if (files.length == 0) {
      // Empty file list will cause sort to wait for further input,
      // and the call will therefore never return
      return;
    }

    Process p = null;

    try {
      List<String> inputFileList = new LinkedList<String>();
      for (int i = 0; i < files.length; i++) {
        if (files[i].exists() && files[i].isFile()) {
          inputFileList.add(files[i].getCanonicalPath());
        } else {
          log.warn(
              "File "
                  + files[i]
                  + " doesn't exist or isn't a regular file, "
                  + "dropping from list of files to "
                  + "sort and merge");
        }
      }
      List<String> cmd = new LinkedList<String>();
      // Prepare to run the unix sort command, see sort manual page for
      // details
      cmd.add("sort");
      cmd.addAll(inputFileList);
      cmd.add("-o");
      cmd.add(outputFile.getCanonicalPath());
      cmd.add("-T");
      cmd.add(Settings.get(WaybackSettings.WAYBACK_AGGREGATOR_TEMP_DIR));
      if (additionalArgs != null && !additionalArgs.isEmpty()) {
        for (String argument : additionalArgs) {
          ArgumentNotValid.checkTrue(
              argument.indexOf(' ') == -1,
              "The argument '" + argument + "' contains spaces; this isn't allowed");
        }
        cmd.addAll(additionalArgs);
      }
      ProcessBuilder pb = new ProcessBuilder(cmd);
      // Reset all locale definitions
      pb.environment().put("LC_ALL", "C");
      // Run the command in the user.dir directory
      pb.directory(new File(System.getProperty("user.dir")));
      p = pb.start();
      p.waitFor();
      if (p.exitValue() != 0) {
        log.error("Failed to sort index files, sort exited with " + "return code " + p.exitValue());
      }
    } catch (Exception e) {
      log.error("Failed to aggregate indexes ", e);
    }
  }
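// Hypothetical call-site sketch (the file names and the "-k" sort key are
// assumptions, not from the source): merging two CDX parts into one sorted file.
// Each extra sort argument must be a single whitespace-free token, so "-k" and
// "1,1" are passed separately; "-k 1,1" in one string would fail the check above.
File[] parts = new File[] {new File("part-1.cdx"), new File("part-2.cdx")};
File merged = new File("merged.cdx");
processFiles(parts, merged, Arrays.asList("-k", "1,1"));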
public class IndexerTestCase {

  private String oldClient = System.getProperty(CommonSettings.ARC_REPOSITORY_CLIENT);
  private String oldFileDir = System.getProperty("settings.common.arcrepositoryClient.fileDir");
  protected static File tempdir = new File(Settings.get(WaybackSettings.WAYBACK_INDEX_TEMPDIR));

  ReloadSettings rs = new ReloadSettings();

  @Before
  public void setUp() {
    rs.setUp();
    System.setProperty(WaybackSettings.HIBERNATE_HBM2DDL_AUTO, "create-drop");
    HibernateUtil.getSession().getSessionFactory().close();
    FileUtils.removeRecursively(TestInfo.WORKING_DIR);
    TestFileUtils.copyDirectoryNonCVS(TestInfo.ORIGINALS_DIR, TestInfo.WORKING_DIR);
    System.setProperty(
        CommonSettings.ARC_REPOSITORY_CLIENT,
        "dk.netarkivet.common.distribute.arcrepository.LocalArcRepositoryClient");
    System.setProperty(
        "settings.common.arcrepositoryClient.fileDir", TestInfo.FILE_DIR.getAbsolutePath());
    System.setProperty(
        CommonSettings.REMOTE_FILE_CLASS, "dk.netarkivet.common.distribute.TestRemoteFile");
    assertTrue(
        ArcRepositoryClientFactory.getPreservationInstance() instanceof LocalArcRepositoryClient);
  }

  @After
  public void tearDown() {
    HibernateUtil.getSession().getSessionFactory().close();
    FileUtils.removeRecursively(TestInfo.WORKING_DIR);
    if (oldClient != null) {
      System.setProperty(CommonSettings.ARC_REPOSITORY_CLIENT, oldClient);
    } else {
      System.setProperty(CommonSettings.ARC_REPOSITORY_CLIENT, "");
    }
    if (oldFileDir != null) {
      System.setProperty("settings.common.arcrepositoryClient.fileDir", oldFileDir);
    } else {
      System.setProperty("settings.common.arcrepositoryClient.fileDir", "");
    }
    rs.tearDown();
  }
}
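// Hedged subclass sketch (class and test names are assumptions): concrete
// indexer tests extend IndexerTestCase so that setUp()/tearDown() supply the
// LocalArcRepositoryClient configuration and a clean WORKING_DIR per test.
public class ExampleIndexerTest extends IndexerTestCase {
  @Test
  public void repositoryClientIsLocal() {
    // The base-class setUp() has already switched the client implementation.
    assertTrue(
        ArcRepositoryClientFactory.getPreservationInstance() instanceof LocalArcRepositoryClient);
  }
}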
  /**
   * Package private constructor for common initialisation.
   *
   * @param harvestID the id of the harvestdefinition
   * @param cfg the configuration to base the Job on
   * @param orderXMLdoc the Heritrix crawler template for the job
   * @param channel the channel on which the job will be submitted.
   * @param forceMaxObjectsPerDomain the maximum number of objects harvested from a domain,
   *     overrides individual configuration settings. -1 means no limit
   * @param forceMaxBytesPerDomain The maximum number of bytes harvested from a domain, or -1 for
   *     no limit.
   * @param forceMaxJobRunningTime The max time in seconds given to the harvester for this job
   * @param harvestNum the run number of the harvest definition
   * @throws ArgumentNotValid if cfg, harvestID or channel is null, harvestID is negative, or any
   *     limit is < -1
   */
  public Job(
      Long harvestID,
      DomainConfiguration cfg,
      HeritrixTemplate orderXMLdoc,
      HarvestChannel channel,
      long forceMaxObjectsPerDomain,
      long forceMaxBytesPerDomain,
      long forceMaxJobRunningTime,
      int harvestNum)
      throws ArgumentNotValid {
    ArgumentNotValid.checkNotNull(cfg, "cfg");
    ArgumentNotValid.checkNotNull(harvestID, "harvestID");
    ArgumentNotValid.checkNotNegative(harvestID, "harvestID");
    ArgumentNotValid.checkNotNull(channel, "channel");

    if (forceMaxObjectsPerDomain < -1) {
      String msg = "forceMaxObjectsPerDomain must be either -1 or positive";
      log.debug(msg);
      throw new ArgumentNotValid(msg);
    }
    if (forceMaxBytesPerDomain < -1) {
      String msg = "forceMaxBytesPerDomain must be either -1 or positive";
      log.debug(msg);
      throw new ArgumentNotValid(msg);
    }

    if (forceMaxBytesPerDomain == 0L) {
      log.warn(
          "forceMaxBytesPerDomain should probably not be 0; it means 0 bytes downloaded per domain");
    }

    if (forceMaxObjectsPerDomain == 0L) {
      log.warn(
          "forceMaxObjectsPerDomain should probably not be 0; it means 0 objects downloaded per domain");
    }

    // setup initial members
    domainConfigurationMap = new HashMap<>();
    origHarvestDefinitionID = harvestID;
    orderXMLname = cfg.getOrderXmlName();
    this.orderXMLdoc = orderXMLdoc;

    setHarvestChannel(channel);

    long maxObjects = NumberUtils.minInf(forceMaxObjectsPerDomain, cfg.getMaxObjects());
    setMaxObjectsPerDomain(maxObjects);
    configurationSetsObjectLimit = (maxObjects != forceMaxObjectsPerDomain);

    long maxBytes = NumberUtils.minInf(forceMaxBytesPerDomain, cfg.getMaxBytes());
    setMaxBytesPerDomain(maxBytes);
    configurationSetsByteLimit = (maxBytes != forceMaxBytesPerDomain);

    long expectation =
        cfg.getExpectedNumberOfObjects(forceMaxObjectsPerDomain, forceMaxBytesPerDomain);
    maxCountObjects = expectation;
    minCountObjects = expectation;
    this.harvestNum = harvestNum;

    addConfiguration(cfg);

    setMaxJobRunningTime(forceMaxJobRunningTime);

    setArchiveFormatInTemplate(Settings.get(HarvesterSettings.HERITRIX_ARCHIVE_FORMAT));

    setAttributes(cfg.getAttributesAndTypes());

    orderXMLdoc.enableOrDisableDeduplication(
        Settings.getBoolean(HarvesterSettings.DEDUPLICATION_ENABLED));

    status = JobStatus.NEW;
  }
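// Hedged sketch of the "-1 means no limit" minimum used above (the exact
// NumberUtils.minInf implementation is assumed, not taken from this source):
static long minInf(long a, long b) {
  if (a == -1) {
    return b; // a is unlimited, so b (possibly also -1) decides
  }
  if (b == -1) {
    return a; // b is unlimited, so a decides
  }
  return Math.min(a, b); // both finite: ordinary minimum
}
// Under this semantics, configurationSetsObjectLimit is true exactly when the
// configuration's own limit was stricter than the forced limit.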
/**
 * Batch job that extracts lines matching a regular expression from a crawl log. The batch job
 * should be restricted to run on metadata files for a specific job only, using the {@link
 * #processOnlyFilesMatching(String)} construct.
 */
@SuppressWarnings({"serial"})
public class CrawlLogLinesMatchingRegexp extends ArchiveBatchJob {

  /** The logger. */
  private static final Logger log = LoggerFactory.getLogger(CrawlLogLinesMatchingRegexp.class);

  /** Metadata URL for crawl logs. */
  private static final String SETUP_URL_FORMAT =
      String.format(
          "metadata://%s/crawl/logs/crawl.log", Settings.get(CommonSettings.ORGANIZATION));

  /** The regular expression to match in the crawl.log line. */
  private final String regexp;

  /**
   * Initialise the batch job.
   *
   * @param regexp The regexp to match in the crawl.log lines.
   */
  public CrawlLogLinesMatchingRegexp(String regexp) {
    ArgumentNotValid.checkNotNullOrEmpty(regexp, "regexp");
    this.regexp = regexp;

    /** One week in milliseconds. */
    batchJobTimeout = 7 * Constants.ONE_DAY_IN_MILLIES;
  }

  /**
   * Does nothing, no initialisation is needed.
   *
   * @param os Not used.
   */
  @Override
  public void initialize(OutputStream os) {}

  @Override
  public ArchiveBatchFilter getFilter() {
    return new ArchiveBatchFilter("OnlyCrawlLog") {
      public boolean accept(ArchiveRecordBase record) {
        String url = record.getHeader().getUrl();
        return url != null && url.startsWith(SETUP_URL_FORMAT);
      }
    };
  }

  /**
   * Processes a crawl-log record, writing each line that matches the regexp to the result.
   *
   * @param record The record to process.
   * @param os The output stream for the result.
   * @throws ArgumentNotValid on null parameters
   * @throws IOFailure on trouble processing the record.
   */
  @Override
  public void processRecord(ArchiveRecordBase record, OutputStream os) {
    ArgumentNotValid.checkNotNull(record, "ArchiveRecordBase record");
    ArgumentNotValid.checkNotNull(os, "OutputStream os");
    // Read with an explicit charset to match the UTF-8 bytes written below.
    BufferedReader arcreader =
        new BufferedReader(
            new InputStreamReader(record.getInputStream(), StandardCharsets.UTF_8));
    try {
      for (String line = arcreader.readLine(); line != null; line = arcreader.readLine()) {
        if (line.matches(regexp)) {
          os.write(line.getBytes("UTF-8"));
          os.write('\n');
        }
      }
    } catch (IOException e) {
      throw new IOFailure("Unable to process (w)arc record", e);
    } finally {
      try {
        arcreader.close();
      } catch (IOException e) {
        log.warn("unable to close arcreader probably", e);
      }
    }
  }

  /**
   * Does nothing, no finishing is needed.
   *
   * @param os Not used.
   */
  @Override
  public void finish(OutputStream os) {}

  @Override
  public String toString() {
    return getClass().getName()
        + ", with arguments: Regexp = "
        + regexp
        + ", Filter = "
        + getFilter();
  }
}
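// Hedged usage sketch: the metadata file-name pattern and the batch submission
// call are assumptions about the surrounding NetarchiveSuite APIs, not taken
// from this source. It restricts the job to one harvest job's metadata file,
// as the class javadoc recommends via processOnlyFilesMatching(String). Note
// that line.matches() anchors the regexp to the whole line, hence the ".*" ends.
CrawlLogLinesMatchingRegexp job = new CrawlLogLinesMatchingRegexp(".*robots\\.txt.*");
job.processOnlyFilesMatching("42-metadata-[0-9]+\\.(w)?arc(\\.gz)?");
// Submission is installation-specific; a viewer-side call might look like:
// BatchStatus status = ArcRepositoryClientFactory.getViewerInstance()
//     .batch(job, Settings.get(CommonSettings.USE_REPLICA_ID));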