Esempio n. 1
0
 /**
  * Tells whether, when URLs are read as file paths, dirCu names a directory that holds fileCu.
  * Callers use this to leave directory nodes out of export files so that ordinary tools (e.g.,
  * unzip) can unpack them. Equal URLs are not expected; they yield false so that the URL is not
  * excluded.
  *
  * @param dirCu the candidate directory
  * @param fileCu the candidate entry below that directory
  * @return true only if fileCu's URL is strictly below dirCu's URL
  */
 boolean isDirOf(CachedUrl dirCu, CachedUrl fileCu) {
   String rawDir = dirCu.getUrl();
   String candidate = fileCu.getUrl();
   String dirPrefix = rawDir.endsWith("/") ? rawDir : rawDir + "/";
   // Being strictly longer than the prefix rules out the equal-URL case.
   return candidate.length() > dirPrefix.length() && candidate.startsWith(dirPrefix);
 }
 /**
  * Asks the node manager of the CU's archival unit to delete the single node at the CU's URL.
  *
  * @param cu the cached URL whose node is deleted
  * @throws IOException declared for callers; raised by the calls made here, if at all
  */
 private void deleteBlock(CachedUrl cu) throws IOException {
   String targetUrl = cu.getUrl();
   log.info("deleting " + targetUrl);
   CachedUrlSetSpec singleNode = new SingleNodeCachedUrlSetSpec(targetUrl);
   ArchivalUnit owner = cu.getArchivalUnit();
   CachedUrlSet nodeSet = owner.makeCachedUrlSet(singleNode);
   NodeManager nodeManager = owner.getPlugin().getDaemon().getNodeManager(owner);
   nodeManager.deleteNode(nodeSet);
 }
  /**
   * Stores the two TAR fixtures in {@code tarAu}, logs every CU in the AU, then runs an
   * {@code ElsevierDeferredArticleMetadataExtractor} over each article and validates every
   * metadata record the emitter has collected.
   *
   * @throws Exception if a fixture cannot be stored or extraction fails; such failures fail the
   *     test rather than being printed and ignored
   */
  public void testFunctionalFromTarHierarchy() throws Exception {
    log.debug3("in testFromTarHierarchy");
    // Load the tarballs. An IOException here propagates: continuing with a partially loaded AU
    // would only hide the real cause of later failures.
    InputStream file_input = null;
    try {
      file_input = getResourceAsStream(realTARFile_A);
      UrlCacher uc = tarAu.makeUrlCacher(new UrlData(file_input, tarHeader, TAR_A_BASE));
      uc.storeContent();
      IOUtil.safeClose(file_input);

      file_input = getResourceAsStream(realTARFile_B);
      uc = tarAu.makeUrlCacher(new UrlData(file_input, tarHeader, TAR_B_BASE));
      uc.storeContent();
    } finally {
      IOUtil.safeClose(file_input);
    }

    CachedUrlSet cus = tarAu.getAuCachedUrlSet();
    for (CachedUrl cu : cus.getCuIterable()) {
      log.debug3("AU - cu is: " + cu.getUrl());
      cu.release();
    }

    // We need to start from the level of the ArticleMetadataExtractor
    MyListEmitter emitter = new MyListEmitter();
    ArticleMetadataExtractor amEx =
        new ElsevierDeferredArticleMetadataExtractor(ArticleFiles.ROLE_ARTICLE_METADATA);

    Iterator<ArticleFiles> it = tarAu.getArticleIterator(MetadataTarget.Any());
    while (it.hasNext()) {
      ArticleFiles af = it.next();
      log.debug3("Metadata test - articlefiles " + af.toString());
      CachedUrl cu = af.getRoleCu(ArticleFiles.ROLE_ARTICLE_METADATA);
      // Fail with an assertion rather than a NullPointerException when the role is missing.
      assertNotNull(cu);
      log.debug3("metadata cu is " + cu.getUrl());
      amEx.extract(MetadataTarget.Any(), af, emitter);
      List<ArticleMetadata> returnList = emitter.getAmList();

      assertNotNull(returnList);
      log.debug3("size of returnList is " + returnList.size());
      for (ArticleMetadata mdRecord : returnList) {
        validateCompleteMetadataRecord(mdRecord);
      }
    }
  }
  /**
   * Crawls the simulated AU, copies its HTML and PDF files into {@code nau} under rewritten
   * journal URLs, deletes a subset of them, and then checks the article iterator's total count
   * and how many articles are HTML-only or PDF-only.
   *
   * @throws Exception if crawling, copying or deleting fails
   */
  public void testArticleCountAndType() throws Exception {
    int expCount = 28;
    PluginTestUtil.crawlSimAu(sau);
    String pat1 = "branch(\\d+)/(\\d+file\\.html)";
    String rep1 = "aps/journal/v123/n$1/full/$2";
    PluginTestUtil.copyAu(sau, nau, ".*[^.][^p][^d][^f]$", pat1, rep1);
    String pat2 = "branch(\\d+)/(\\d+file\\.pdf)";
    String rep2 = "aps/journal/v123/n$1/pdf/$2";
    PluginTestUtil.copyAu(sau, nau, ".*\\.pdf$", pat2, rep2);

    // Remove some URLs
    int deleted = 0;
    for (Iterator<?> nodeIt = nau.getAuCachedUrlSet().contentHashIterator(); nodeIt.hasNext(); ) {
      CachedUrlSetNode cusn = (CachedUrlSetNode) nodeIt.next();
      if (cusn instanceof CachedUrl) {
        CachedUrl cu = (CachedUrl) cusn;
        String url = cu.getUrl();
        if (url.contains("/journal/")
            && (url.endsWith("1file.html") || url.endsWith("2file.pdf"))) {
          deleteBlock(cu);
          ++deleted;
        }
      }
    }
    assertEquals(8, deleted);

    Iterator<ArticleFiles> it = nau.getArticleIterator();
    int count = 0;
    int countHtmlOnly = 0;
    int countPdfOnly = 0;
    while (it.hasNext()) {
      ArticleFiles af = it.next();
      log.info(af.toString());
      CachedUrl cu = af.getFullTextCu();
      // Check for null before dereferencing so a missing full-text CU fails as an assertion.
      assertNotNull(cu);
      String url = cu.getUrl();
      String contentType = cu.getContentType();
      log.debug("count " + count + " url " + url + " " + contentType);
      count++;
      String pdfUrl = af.getRoleUrl(ArticleFiles.ROLE_FULL_TEXT_PDF);
      if (pdfUrl == null) {
        ++countHtmlOnly;
      } else if (pdfUrl.equals(url)) {
        // Compare by value: reference equality on Strings depends on how they were created.
        ++countPdfOnly;
      }
    }
    log.debug("Article count is " + count);
    assertEquals(expCount, count);
    assertEquals(4, countHtmlOnly);
    assertEquals(4, countPdfOnly);
  }
 /**
  * Runs the superclass extraction, cooks the result with {@code tagMap}, and makes sure the
  * access URL refers to content held by the CU's archival unit: if the extracted access URL is
  * missing, empty, or has no content, it is replaced by the URL of {@code cu}.
  *
  * @param target the metadata target passed through to the superclass
  * @param cu the cached URL being extracted from
  * @return the cooked metadata
  * @throws IOException if the superclass extraction fails
  */
 @Override
 public ArticleMetadata extract(MetadataTarget target, CachedUrl cu) throws IOException {
   ArticleMetadata metadata = super.extract(target, cu);
   metadata.cook(tagMap);
   String accessUrl = metadata.get(MetadataField.FIELD_ACCESS_URL);
   // Short-circuit order matters: the AU is consulted only for a non-empty URL.
   boolean accessUrlHasContent =
       accessUrl != null
           && !accessUrl.isEmpty()
           && cu.getArchivalUnit().makeCachedUrl(accessUrl).hasContent();
   if (!accessUrlHasContent) {
     metadata.replace(MetadataField.FIELD_ACCESS_URL, cu.getUrl());
   }
   return metadata;
 }
Esempio n. 6
0
 /**
  * Writes one cached URL to the archive writer {@code aw}. When {@code isResponse} is set, the
  * HTTP response header string is prepended to the content and its length is added to the
  * recorded size; otherwise only the content is written.
  *
  * <p>The CU is always released on exit, including when reading its size, properties or fetch
  * time fails before any content is written.
  *
  * @param cu the cached URL to export; released before this method returns
  * @throws IOException if reading the CU or writing to the archive fails
  */
 protected void writeCu(CachedUrl cu) throws IOException {
   InputStream contentIn = null;
   try {
     String url = cu.getUrl();
     long contentSize = cu.getContentSize();
     CIProperties props = cu.getProperties();
     // Throws NumberFormatException if the fetch-time property is absent or malformed; the
     // finally block below still releases the CU in that case.
     long fetchTime = Long.parseLong(props.getProperty(CachedUrl.PROPERTY_FETCH_TIME));
     contentIn = cu.getUnfilteredInputStream();
     if (isResponse) {
       String hdrString = getHttpResponseString(cu);
       // NOTE(review): length() counts chars, not bytes; this matches the byte count only if
       // the header is encoded one byte per char -- confirm ReaderInputStream's encoding.
       long size = contentSize + hdrString.length();
       InputStream headerIn = new ReaderInputStream(new StringReader(hdrString));
       InputStream concat = new SequenceInputStream(headerIn, contentIn);
       try {
         aw.write(xlateFilename(url), cu.getContentType(), getHostIp(), fetchTime, size, concat);
       } finally {
         IOUtil.safeClose(concat);
       }
     } else {
       aw.write(
           xlateFilename(url),
           cu.getContentType(),
           getHostIp(),
           fetchTime,
           contentSize,
           contentIn);
     }
   } finally {
     // Close the content stream on every path, not only the response branch.
     IOUtil.safeClose(contentIn);
     AuUtil.safeRelease(cu);
   }
 }
 /**
  * Looks up the archive file type for a CU, trying its MIME type first and falling back to its
  * URL (filename extension) when the MIME type gives no answer.
  *
  * @param cu the cached URL to classify
  * @return the archive file type, or null if neither lookup yields one
  * @throws MalformedURLException declared for callers; presumably raised by the URL lookup
  */
 public String getFromCu(CachedUrl cu) throws MalformedURLException {
   String byMime = getFromMime(cu.getContentType());
   return (byMime != null) ? byMime : getFromUrl(cu.getUrl());
 }
    /**
     * Logs the CU's URL, delegates extraction to the superclass, and cooks the result with
     * {@code tagMap}.
     *
     * @param target the metadata target passed through to the superclass
     * @param cu the cached URL being extracted from
     * @return the cooked metadata
     * @throws IOException if the superclass extraction fails
     */
    @Override
    public ArticleMetadata extract(MetadataTarget target, CachedUrl cu) throws IOException {
      String sourceUrl = cu.getUrl();
      log.debug3("Metadata - cachedurl cu:" + sourceUrl);
      ArticleMetadata cooked = super.extract(target, cu);
      cooked.cook(tagMap);
      return cooked;
    }
    /**
     * Builds the ArticleFiles for a CU whose URL matches {@code ABSTRACT_PATTERN}.
     *
     * @param cu the cached URL offered by the iterator
     * @return the result of {@code processAbstract}, or null (after logging a warning) when the
     *     URL does not match the pattern
     */
    @Override
    protected ArticleFiles createArticleFiles(CachedUrl cu) {
      String candidateUrl = cu.getUrl();
      Matcher abstractMatch = ABSTRACT_PATTERN.matcher(candidateUrl);
      if (!abstractMatch.find()) {
        log.warning(
            "Mismatch between article iterator factory and article iterator: " + candidateUrl);
        return null;
      }
      return processAbstract(cu, abstractMatch);
    }
 /**
  * Builds the ArticleFiles for a CU whose URL matches {@code PATTERN}.
  *
  * @param cu the cached URL offered by the iterator
  * @return the result of {@code processFullTextPdf}, or null (after logging a warning) when
  *     the URL does not match the pattern
  */
 @Override
 protected ArticleFiles createArticleFiles(CachedUrl cu) {
   String candidateUrl = cu.getUrl();
   log.debug3("iterating url: " + candidateUrl);
   Matcher pdfMatch = PATTERN.matcher(candidateUrl);
   if (!pdfMatch.find()) {
     log.warning(
         "Mismatch between article iterator factory and article iterator: " + candidateUrl);
     return null;
   }
   return processFullTextPdf(cu, pdfMatch);
 }
 /**
  * Extracts HTML meta-tag metadata from {@code cu}, cooks it with {@code tagMap}, settles the
  * access URL, and emits the record. The extracted access URL is kept only if it is non-empty
  * and the archival unit has content for it; otherwise the CU's own URL is used. The chosen URL
  * is passed through {@code normalizeHttpHttpsFromBaseUrl} before being stored.
  *
  * @param target the metadata target passed to the meta-tag extractor
  * @param cu the cached URL being extracted from
  * @param emitter receives the finished metadata record
  * @throws IOException if the underlying extraction fails
  */
 @Override
 public void extract(MetadataTarget target, CachedUrl cu, Emitter emitter) throws IOException {
   ArticleMetadata metadata = new SimpleHtmlMetaTagMetadataExtractor().extract(target, cu);
   metadata.cook(tagMap);
   String extractedUrl = metadata.get(MetadataField.FIELD_ACCESS_URL);
   ArchivalUnit owningAu = cu.getArchivalUnit();
   boolean extractedUrlUsable =
       extractedUrl != null
           && !extractedUrl.isEmpty()
           && owningAu.makeCachedUrl(extractedUrl).hasContent();
   String accessUrl = extractedUrlUsable ? extractedUrl : cu.getUrl();
   metadata.replace(
       MetadataField.FIELD_ACCESS_URL,
       HttpToHttpsUtil.AuUtil.normalizeHttpHttpsFromBaseUrl(owningAu, accessUrl));
   emitter.emitMetadata(cu, metadata);
 }
Esempio n. 12
0
 /**
  * Iterates over the AU's cached URLs and writes each selected version through {@code writeCu}.
  *
  * <p>When {@code excludeDirNodes} is set, a CU is skipped if the CU that follows it in
  * iteration order lies beneath it according to {@code isDirOf}. At most {@code maxVersions}
  * versions of each CU are exported, or all versions when {@code maxVersions} is not positive.
  *
  * <p>A disk-full error records an error, sets {@code isDiskFull} and stops the export. Every
  * other failure, including an IOException that is not a disk-full error, is recorded and
  * counted; the export stops once the count exceeds {@code maxErrors}.
  */
 private void writeFiles() {
   PlatformUtil platutil = PlatformUtil.getInstance();
   CuIterator iter = AuUtil.getCuIterator(au);
   int errs = 0;
   CachedUrl nextCu = getNextCu(iter);
   while (nextCu != null) {
     CachedUrl curCu = nextCu;
     // Look one CU ahead: the directory test needs the successor of the current CU.
     nextCu = getNextCu(iter);
     if (excludeDirNodes && nextCu != null && isDirOf(curCu, nextCu)) {
       continue;
     }
     CachedUrl[] cuVersions =
         curCu.getCuVersions(maxVersions > 0 ? maxVersions : Integer.MAX_VALUE);
     for (CachedUrl cu : cuVersions) {
       try {
         log.debug2("Exporting " + cu.getUrl());
         writeCu(cu);
       } catch (Exception e) {
         if (e instanceof IOException && platutil.isDiskFullError((IOException) e)) {
           recordError("Disk full, can't write export file.");
           isDiskFull = true;
           return;
         }
         // A separate IOException handler would drop non-disk-full I/O failures without
         // recording them, so they are handled here with every other failure.
         // XXX Would like to differentiate between errors opening or
         // reading CU, which shouldn't cause abort, and errors writing
         // to export file, which should.
         recordError("Unable to copy " + cu.getUrl(), e);
         if (errs++ >= maxErrors) {
           recordError("Aborting after " + errs + " errors");
           return;
         }
       }
     }
   }
 }
 /*
  * This is complicated. MOST AUs have articles that live below an issue-level TOC,
  * that is,
  * <blah>/<journal_id>/vol#/iss#/ is a TOC with no relevant metadata
  * <blah>/<journal_id>/vol#/iss#/xxx is an article with metadata
  * (e.g. Economist Voice V1)
  * BUT
  * some AUs have issues with only one article, in which case
  * <blah>/<journal_id>/vol#/iss#/ is an abstract with metadata
  * (e.g. Rhodes Cook V4)
  * and a few AUs have a mixture of both
  * (e.g. Forum for Health Economics V5)
  * So to identify ALL articles we also have to capture issue-level items, look at the
  * HTML, and count the item as an article only if it carries article metadata.
  */
 @Override
 protected ArticleFiles createArticleFiles(CachedUrl cu) {
   String candidateUrl = cu.getUrl();
   Matcher articleMatch = pattern.matcher(candidateUrl);
   if (!articleMatch.find()) {
     log.warning(
         "Mismatch between article iterator factory and article iterator: " + candidateUrl);
     return null;
   }
   // A URL that could be a TOC counts as an article only when it carries article metadata;
   // the metadata check is made only for such URLs.
   boolean couldBeToc = TOC_pattern.matcher(candidateUrl).find();
   if (couldBeToc && !hasArticleMetadata(cu)) {
     return null; // this was a TOC, not an article
   }
   return processUrl(cu, articleMatch);
 }