/**
 * Opens a subfield element for the given field and notifies the listener.
 *
 * <p>The subfield code is taken from {@code designator.subfieldId()}; a missing or empty code is
 * replaced by {@code "a"}. Nothing happens when {@code designator} is {@code null}.
 *
 * @param designator the field describing the subfield, may be {@code null}
 */
@Override
 public void beginSubField(Field designator) {
   if (designator == null) {
     return;
   }
   try {
     String code = designator.subfieldId();
     boolean codeMissing = code == null || code.isEmpty();
     AttributesImpl codeAttribute = new AttributesImpl();
     // fall back to subfield code "a" when the field carries none
     codeAttribute.addAttribute(nsUri, CODE, CODE, "CDATA", codeMissing ? "a" : code);
     if (contentHandler != null) {
       contentHandler.startElement(nsUri, SUBFIELD, SUBFIELD, codeAttribute);
     }
     if (listener != null) {
       listener.beginSubField(designator);
     }
   } catch (Exception ex) {
     if (fatalerrors) {
       throw new RuntimeException(ex);
     }
     if (!silenterrors) {
       logger.warn(designator + ": " + ex.getMessage(), ex);
     }
   }
 }
 /**
  * Notifies the listener, writes the subfield data as character content and closes the subfield
  * element.
  *
  * <p>The element is closed with the same namespace URI ({@code nsUri}) that
  * {@code beginSubField} used to open it. A {@code null} or empty data value produces an empty
  * element. Nothing happens when {@code designator} is {@code null}.
  *
  * @param designator the field carrying the subfield data, may be {@code null}
  */
 @Override
 public void endSubField(Field designator) {
   if (designator == null) {
     return;
   }
   try {
     if (listener != null) {
       listener.endSubField(designator);
     }
     if (contentHandler != null) {
       String value = designator.data();
       // guard against missing data so the element is still closed properly
       if (value != null && !value.isEmpty()) {
         value = normalizeValue(value);
         contentHandler.characters(value.toCharArray(), 0, value.length());
       }
       // must match the namespace used by startElement in beginSubField
       contentHandler.endElement(nsUri, SUBFIELD, SUBFIELD);
     }
   } catch (Exception ex) {
     if (fatalerrors) {
       throw new RuntimeException(ex);
     } else if (!silenterrors) {
       logger.warn(designator + ": " + ex.getMessage(), ex);
     }
   }
 }
 /**
  * Closes the currently open record.
  *
  * <p>Notifies the listener, closes the record element, then sends a {@code trailer(null)} event
  * to the listener and clears the {@code recordOpen} flag. Does nothing when no record is open.
  * Failures are rethrown as {@link RuntimeException} when {@code fatalerrors} is set, otherwise
  * logged unless {@code silenterrors} is set.
  */
 @Override
 public void endRecord() {
   if (!recordOpen) {
     return;
   }
   try {
     if (listener != null) {
       listener.endRecord();
     }
     if (contentHandler != null) {
       contentHandler.endElement(nsUri, RECORD, RECORD);
     }
     if (listener != null) {
       // emit trailer event, drives record output segmentation
       listener.trailer(null);
     }
     this.recordOpen = false;
   } catch (Exception ex) {
     if (fatalerrors) {
       throw new RuntimeException(ex);
     } else if (!silenterrors) {
       // this method has no field designator to report, log the failure itself
       logger.warn(ex.getMessage(), ex);
     }
   }
 }
 /**
  * Closes the currently open data field.
  *
  * <p>Notifies the listener first. If the designator carries non-empty data, that data is written
  * into a default subfield with code {@code "a"} before the data field element is closed. The
  * element is closed with the same namespace URI ({@code nsUri}) that {@code beginDataField} used
  * to open it. Does nothing when no data field is open.
  *
  * @param designator the field being closed, may be {@code null}
  */
 @Override
 public void endDataField(Field designator) {
   try {
     if (!datafieldOpen) {
       return;
     }
     if (listener != null) {
       listener.endDataField(designator);
     }
     if (designator != null) {
       String value = designator.data();
       if (value != null && !value.isEmpty()) {
         value = normalizeValue(value);
         // write data field per default into a subfield with code 'a'
         AttributesImpl attrs = new AttributesImpl();
         attrs.addAttribute(nsUri, CODE, CODE, "CDATA", "a");
         if (contentHandler != null) {
           contentHandler.startElement(nsUri, SUBFIELD, SUBFIELD, attrs);
           contentHandler.characters(value.toCharArray(), 0, value.length());
           contentHandler.endElement(nsUri, SUBFIELD, SUBFIELD);
         }
       }
     }
     if (contentHandler != null) {
       // must match the namespace used by startElement in beginDataField
       contentHandler.endElement(nsUri, DATAFIELD, DATAFIELD);
     }
     datafieldOpen = false;
   } catch (Exception ex) {
     if (fatalerrors) {
       throw new RuntimeException(ex);
     } else if (!silenterrors) {
       logger.warn(designator + ": " + ex.getMessage(), ex);
     }
   }
 }
 /**
  * Opens a new record element and notifies the listener.
  *
  * <p>The {@code format} attribute is only written when the schema is not {@code MARC21}; the
  * {@code type} attribute is written whenever a type is given. Does nothing when a record is
  * already open. Failures are rethrown as {@link RuntimeException} when {@code fatalerrors} is
  * set, otherwise logged unless {@code silenterrors} is set.
  *
  * @param format the record format, may be {@code null}
  * @param type the record type, may be {@code null}
  */
 @Override
 public void beginRecord(String format, String type) {
   if (recordOpen) {
     return;
   }
   try {
     AttributesImpl attrs = new AttributesImpl();
     if (format != null && !"MARC21".equalsIgnoreCase(schema)) {
       attrs.addAttribute(nsUri, FORMAT, FORMAT, "CDATA", format);
     }
     if (type != null) {
       attrs.addAttribute(nsUri, TYPE, TYPE, "CDATA", type);
     }
     if (contentHandler != null) {
       contentHandler.startElement(nsUri, RECORD, RECORD, attrs);
     }
     if (listener != null) {
       listener.beginRecord(format, type);
     }
     this.recordOpen = true;
   } catch (Exception ex) {
     if (fatalerrors) {
       throw new RuntimeException(ex);
     } else if (!silenterrors) {
       // this method has no field designator to report, log the failure itself
       logger.warn(ex.getMessage(), ex);
     }
   }
 }
  /**
   * Starts the document and opens the collection root element.
   *
   * <p>Selects the namespace URI and schema location depending on whether the schema is
   * {@code MARC21}, stores the namespace in {@code nsUri}, and declares it as the default prefix
   * mapping. Only logs a warning when no content handler is set.
   *
   * @throws SAXException if the content handler rejects an event
   */
  public void beginCollection() throws SAXException {
    if (contentHandler == null) {
      logger.warn("no content handler set");
      return;
    }
    contentHandler.startDocument();
    // pick namespace and schema location for the configured schema
    final boolean marc21 = "MARC21".equalsIgnoreCase(schema);
    final String schemaLocation;
    if (marc21) {
      this.nsUri = MARC21_NS_URI;
      schemaLocation = MARC21_NS_URI + " " + MARC21_SCHEMA;
    } else {
      this.nsUri = NS_URI;
      schemaLocation = NS_URI + " " + MARCXCHANGE_SCHEMA;
    }
    // write schema info
    AttributesImpl rootAttributes = new AttributesImpl();
    rootAttributes.addAttribute(
        XMLNS.NS_URI, XSI.NS_PREFIX, XMLNS.NS_PREFIX + ":" + XSI.NS_PREFIX, "CDATA", XSI.NS_URI);
    rootAttributes.addAttribute(
        XSI.NS_URI, "schemaLocation", XSI.NS_PREFIX + ":schemaLocation", "CDATA", schemaLocation);
    contentHandler.startPrefixMapping("", nsUri);
    contentHandler.startElement(nsUri, COLLECTION, COLLECTION, rootAttributes);
  }
 /**
  * Closes the collection root element and ends the document.
  *
  * <p>Only logs a warning when no content handler is set.
  *
  * @throws SAXException if the content handler rejects an event
  */
 public void endCollection() throws SAXException {
   if (contentHandler != null) {
     contentHandler.endElement(nsUri, COLLECTION, COLLECTION);
     contentHandler.endDocument();
   } else {
     logger.warn("no content handler set");
   }
 }
 /**
  * Opens a data field element for the given field and notifies the listener.
  *
  * <p>Control fields are delegated to {@code beginControlField}/{@code endControlField} and
  * written immediately. A missing or empty tag is replaced by {@code Field.NULL_TAG}, which is
  * also written back to the designator. Each indicator character becomes one
  * {@code ind1..indN} attribute; for the {@code MARC21} schema, indicators that are not present
  * are padded with blanks up to {@code ind2}. Each indicator attribute is added exactly once.
  * Does nothing when {@code designator} is {@code null} or a data field is already open.
  *
  * @param designator the field to open, may be {@code null}
  */
 @Override
 public void beginDataField(Field designator) {
   if (designator == null) {
     return;
   }
   try {
     if (designator.isControlField()) {
       beginControlField(designator);
       endControlField(designator);
       return;
     }
     if (datafieldOpen) {
       return;
     }
     AttributesImpl attrs = new AttributesImpl();
     String tag = designator.tag();
     if (tag == null || tag.length() == 0) {
       tag = Field.NULL_TAG; // fallback
       designator.tag(tag);
     }
     attrs.addAttribute(nsUri, TAG, TAG, "CDATA", tag);
     int ind = designator.indicator() != null ? designator.indicator().length() : 0;
     // set indicators that are actually present
     for (int i = 1; i <= ind; i++) {
       attrs.addAttribute(
           null, IND + i, IND + i, "CDATA", designator.indicator().substring(i - 1, i));
     }
     // force at least two indicators if schema is Marc 21: pad only the missing ones,
     // AttributesImpl does not check for duplicates so each name must be added once
     if ("MARC21".equalsIgnoreCase(schema)) {
       for (int i = ind + 1; i <= 2; i++) {
         attrs.addAttribute(null, IND + i, IND + i, "CDATA", " ");
       }
     }
     if (contentHandler != null) {
       contentHandler.startElement(nsUri, DATAFIELD, DATAFIELD, attrs);
     }
     if (listener != null) {
       listener.beginDataField(designator);
     }
     datafieldOpen = true;
   } catch (Exception ex) {
     if (fatalerrors) {
       throw new RuntimeException(ex);
     } else if (!silenterrors) {
       logger.warn(designator + ": " + ex.getMessage(), ex);
     }
   }
 }
示例#9
0
 /**
  * Takes the next URI from the input queue and processes it with {@code push}.
  *
  * <p>The file counter is incremented only after a URI was processed successfully. When the
  * queue is empty, or processing fails, the importer is marked as done; failures are logged.
  *
  * @return the shared file counter
  */
 @Override
 public AtomicLong next() {
   if (done) {
     return fileCounter;
   }
   try {
     URI uri = input.poll();
     if (uri == null) {
       // queue exhausted: nothing was processed, so do not count a file
       done = true;
     } else {
       push(uri);
       fileCounter.incrementAndGet();
     }
   } catch (Exception e) {
     logger.error(e.getMessage(), e);
     done = true;
   }
   return fileCounter;
 }
示例#10
0
 /**
  * Opens a control field element tagged with {@code designator.tag()} and notifies the listener.
  *
  * <p>Nothing happens when {@code designator} is {@code null}. Failures are rethrown as
  * {@link RuntimeException} when {@code fatalerrors} is set, otherwise logged unless
  * {@code silenterrors} is set.
  *
  * @param designator the control field to open, may be {@code null}
  */
 @Override
 public void beginControlField(Field designator) {
   if (designator != null) {
     try {
       AttributesImpl tagAttribute = new AttributesImpl();
       tagAttribute.addAttribute(nsUri, TAG, TAG, "CDATA", designator.tag());
       if (contentHandler != null) {
         contentHandler.startElement(nsUri, CONTROLFIELD, CONTROLFIELD, tagAttribute);
       }
       if (listener != null) {
         listener.beginControlField(designator);
       }
     } catch (Exception ex) {
       if (fatalerrors) {
         throw new RuntimeException(ex);
       }
       if (!silenterrors) {
         logger.warn(designator + ": " + ex.getMessage(), ex);
       }
     }
   }
 }
示例#11
0
 /**
  * Writes the record leader as a complete leader element and notifies the listener.
  *
  * <p>Nothing happens when {@code value} is {@code null}. Failures are rethrown as
  * {@link RuntimeException} when {@code fatalerrors} is set, otherwise logged unless
  * {@code silenterrors} is set.
  *
  * @param value the leader content, may be {@code null}
  */
 @Override
 public void leader(String value) {
   if (value == null) {
     return;
   }
   try {
     if (contentHandler != null) {
       contentHandler.startElement(nsUri, LEADER, LEADER, EMPTY_ATTRIBUTES);
       contentHandler.characters(value.toCharArray(), 0, value.length());
       contentHandler.endElement(nsUri, LEADER, LEADER);
     }
     if (listener != null) {
       listener.leader(value);
     }
   } catch (Exception ex) {
     if (fatalerrors) {
       throw new RuntimeException(ex);
     } else if (!silenterrors) {
       // this method has no field designator to report, log the failure itself
       logger.warn(ex.getMessage(), ex);
     }
   }
 }
示例#12
0
 /**
  * Notifies the listener, writes the control field data and closes the control field element.
  *
  * <p>The data of tag {@code 001} is remembered in {@code id}. For tags {@code 006},
  * {@code 007} and {@code 008}, {@code '^'} fill characters are replaced by {@code '|'}.
  * A {@code null} or empty data value, or a {@code null} tag, is tolerated so the element is
  * still closed.
  *
  * @param designator the control field being closed, may be {@code null}
  */
 @Override
 public void endControlField(Field designator) {
   try {
     if (listener != null) {
       listener.endControlField(designator);
     }
     if (designator != null) {
       String value = designator.data();
       if (value != null && !value.isEmpty()) {
         String tag = designator.tag();
         // switching on a null string would throw and leave the element unclosed
         if (tag != null) {
           switch (tag) {
             case "001":
               this.id = value;
               break;
             case "006":
             case "007":
             case "008":
               // fix fill characters here
               value = value.replace('^', '|');
               break;
             default:
               break;
           }
         }
         if (contentHandler != null) {
           contentHandler.characters(value.toCharArray(), 0, value.length());
         }
       }
     }
     if (contentHandler != null) {
       contentHandler.endElement(nsUri, CONTROLFIELD, CONTROLFIELD);
     }
   } catch (Exception ex) {
     if (fatalerrors) {
       throw new RuntimeException(ex);
     } else if (!silenterrors) {
       logger.warn(designator + ": " + ex.getMessage(), ex);
     }
   }
 }
示例#13
0
  /**
   * Command line entry point: collects input files, creates a new Elasticsearch index and runs
   * the import with the configured number of threads.
   *
   * <p>Exits the JVM with code 0 on success and 1 on failure or after printing the help text.
   *
   * @param args the command line options, see the help text below
   */
  public static void main(String[] args) {
    int exitcode = 0;
    try {
      OptionParser parser =
          new OptionParser() {
            {
              accepts("elasticsearch").withRequiredArg().ofType(String.class).required();
              accepts("index").withRequiredArg().ofType(String.class).required();
              accepts("type").withRequiredArg().ofType(String.class).required();
              accepts("maxbulkactions").withRequiredArg().ofType(Integer.class).defaultsTo(1000);
              accepts("maxconcurrentbulkrequests")
                  .withRequiredArg()
                  .ofType(Integer.class)
                  .defaultsTo(4 * Runtime.getRuntime().availableProcessors());
              accepts("mock").withOptionalArg().ofType(Boolean.class).defaultsTo(Boolean.FALSE);
              accepts("path").withRequiredArg().ofType(String.class).required();
              // not required: the default below applies when the option is omitted
              accepts("pattern").withRequiredArg().ofType(String.class).defaultsTo("*.txt");
              accepts("threads").withRequiredArg().ofType(Integer.class).defaultsTo(1);
              accepts("help");
            }
          };
      // NOTE(review): parsing presumably still fails when required options are missing, even
      // if --help is given; confirm against the option parser version in use
      final OptionSet options = parser.parse(args);
      // "help" takes no argument, so test for presence of the option itself
      if (options.has("help")) {
        System.err.println(
            "Help for "
                + Medline.class.getCanonicalName()
                + lf
                + " --help                 print this help message"
                + lf
                + " --elasticsearch <uri>  Elasticsearch URI"
                + lf
                + " --index <index>        Elasticsearch index name"
                + lf
                + " --type <type>          Elasticsearch type name"
                + lf
                + " --maxbulkactions <n>   the number of bulk actions per request (optional, default: 1000)"
                + lf
                + " --maxconcurrentbulkrequests <n> the number of concurrent bulk requests (optional, default: 4 * cpu cores)"
                + lf
                + " --mock [<boolean>]     use the mock ingest client (optional, default: false)"
                + lf
                + " --path <path>          a file path from where the input files are recursively collected (required)"
                + lf
                + " --pattern <pattern>    a regex for selecting matching file names for input (default: *.txt)"
                + lf
                + " --threads <n>          the number of threads (optional, default: 1)");
        System.exit(1);
      }
      input =
          new Finder((String) options.valueOf("pattern"))
              .find((String) options.valueOf("path"))
              .getURIs();
      final Integer threads = (Integer) options.valueOf("threads");

      logger.info("found {} input files", input.size());

      URI esURI = URI.create((String) options.valueOf("elasticsearch"));
      index = (String) options.valueOf("index");
      type = (String) options.valueOf("type");
      int maxbulkactions = (Integer) options.valueOf("maxbulkactions");
      int maxconcurrentbulkrequests = (Integer) options.valueOf("maxconcurrentbulkrequests");
      boolean mock = (Boolean) options.valueOf("mock");

      final IngestClient es = mock ? new MockIngestClient() : new IngestClient();

      es.maxBulkActions(maxbulkactions)
          .maxConcurrentBulkRequests(maxconcurrentbulkrequests)
          .newClient(esURI)
          .waitForCluster(ClusterHealthStatus.YELLOW, TimeValue.timeValueSeconds(30));

      logger.info("creating new index ...");
      es.setIndex(index).setType(type).newIndex();
      logger.info("... new index created");

      final ResourceSink sink = new ResourceSink(es);

      ImportService service =
          new ImportService()
              .threads(threads)
              .factory(
                  new ImporterFactory() {
                    @Override
                    public Importer newImporter() {
                      return new SpringerCitations(sink);
                    }
                  })
              .execute();

      logger.info(
          "finished, number of files = {}, resources indexed = {}", fileCounter, sink.getCounter());

      service.shutdown();
      logger.info("service shutdown");

      es.shutdown();
      logger.info("elasticsearch client shutdown");

    } catch (InterruptedException e) {
      // restore the interrupt status before leaving
      Thread.currentThread().interrupt();
      logger.error(e.getMessage(), e);
      exitcode = 1;
    } catch (IOException | ExecutionException e) {
      logger.error(e.getMessage(), e);
      exitcode = 1;
    }
    System.exit(exitcode);
  }
示例#14
0
  /**
   * Reads one citation file and emits it as a single resource.
   *
   * <p>Only lines starting with {@code '%'} followed by a one-character tag are evaluated; the
   * value is the trimmed remainder of the line from the fourth character on. Lines that are too
   * short to carry a tag are skipped, and a tag without a value yields an empty string. The
   * collected values are mapped onto a resource that is handed to {@code out}.
   *
   * @param uri the location of the file to read, ignored when {@code null}
   * @throws Exception if the input cannot be opened or read, or the output fails
   */
  private void push(URI uri) throws Exception {
    if (uri == null) {
      return;
    }
    InputStream in = factory.open(uri);
    if (in == null) {
      throw new IOException("unable to open " + uri);
    }
    try (BufferedReader reader = new BufferedReader(new InputStreamReader(in, "UTF-8"))) {
      String title = null;
      List<String> author = new LinkedList<>();
      String year = null;
      String journal = null;
      String issn = null;
      String volume = null;
      String issue = null;
      String pagination = null;
      String doi = null;
      String publisher = null;
      String line;
      while ((line = reader.readLine()) != null) {
        // a usable line needs at least the '%' marker and the tag character
        if (line.length() < 2 || '%' != line.charAt(0)) {
          continue;
        }
        // the value starts after "%X "; a bare tag has no value
        String value = line.length() > 3 ? line.substring(3).trim() : "";
        switch (line.charAt(1)) {
          case 'D':
            year = value;
            break;
          case 'T':
            title = value;
            break;
          case '@':
            issn = value;
            break;
          case 'J':
            journal = value;
            break;
          case 'A':
            author.add(value);
            break;
          case 'V':
            volume = value;
            break;
          case 'N':
            issue = value;
            break;
          case 'P':
            pagination = value;
            break;
          case 'R':
            doi = value;
            break;
          case 'I':
            publisher = value;
            break;
          case 'U': // URL (DOI resolver)
          case 'K': // keywords
          case '0': // record type
          case '8': // day
          case 'G': // language
            // known tags that are not mapped
            break;
          default:
            logger.warn("unknown tag: " + line);
            break;
        }
      }
      // create bibliographic key

      String key =
          author.isEmpty()
              ? null
              : new WorkAuthor().authorName(author.get(0)).workName(title).createIdentifier();

      IRI dereferencable =
          IRI.builder().scheme("http").host("xbib.info").path("/doi/").fragment(doi).build();

      Resource r =
          resourceContext
              .newResource()
              .id(dereferencable)
              .a(FABIO_ARTICLE)
              .add("xbib:key", key)
              .add("prism:doi", doi)
              .add("dc:title", title);
      for (String a : author) {
        r.add("dc:creator", a);
      }
      r.add("prism:publicationDate", new SimpleLiteral<>(year).type(Literal.GYEAR));
      r.newResource(FRBR_EMBODIMENT).a(FABIO_PERIODICAL_VOLUME).add("prism:volume", volume);
      r.newResource(FRBR_EMBODIMENT).a(FABIO_PERIODICAL_ISSUE).add("prism:number", issue);
      r.newResource(FRBR_EMBODIMENT).a(FABIO_PRINT_OBJECT).add("prism:pageRange", pagination);
      r.newResource(FRBR_PARTOF)
          .a(FABIO_JOURNAL)
          .add("prism:publicationName", journal)
          .add("prism:issn", issn)
          .add("dc:publisher", publisher);
      // re-identify the resource by index and type, keeping the DOI fragment
      resourceContext
          .resource()
          .id(
              IRI.builder()
                  .scheme("http")
                  .host(index)
                  .query(type)
                  .fragment(resourceContext.resource().id().getFragment())
                  .build());
      out.output(resourceContext, resourceContext.contentBuilder());
    }
  }