@Override public void beginSubField(Field designator) { if (designator == null) { return; } try { AttributesImpl attrs = new AttributesImpl(); String subfieldId = designator.subfieldId(); if (subfieldId == null || subfieldId.length() == 0) { subfieldId = "a"; // fallback } attrs.addAttribute(nsUri, CODE, CODE, "CDATA", subfieldId); if (contentHandler != null) { contentHandler.startElement(nsUri, SUBFIELD, SUBFIELD, attrs); } if (listener != null) { listener.beginSubField(designator); } } catch (Exception ex) { if (fatalerrors) { throw new RuntimeException(ex); } else if (!silenterrors) { logger.warn(designator + ": " + ex.getMessage(), ex); } } }
/**
 * Notifies the listener that a subfield ended, writes the field's data as character content
 * and closes the {@code SUBFIELD} element.
 *
 * <p>Any exception is rethrown wrapped in a {@link RuntimeException} when {@code fatalerrors}
 * is set; otherwise it is logged as a warning unless {@code silenterrors} is set.
 *
 * @param designator the field whose subfield ends; a {@code null} value is ignored
 */
@Override
public void endSubField(Field designator) {
    if (designator == null) {
        return;
    }
    try {
        if (listener != null) {
            listener.endSubField(designator);
        }
        if (contentHandler != null) {
            String value = designator.data();
            // Guard against missing data the same way endDataField does; without the null
            // check an NPE would skip the endElement call below and leave the element open.
            if (value != null && !value.isEmpty()) {
                value = normalizeValue(value);
                contentHandler.characters(value.toCharArray(), 0, value.length());
            }
            // Close with the same namespace URI that beginSubField used to open the element.
            contentHandler.endElement(nsUri, SUBFIELD, SUBFIELD);
        }
    } catch (Exception ex) {
        if (fatalerrors) {
            throw new RuntimeException(ex);
        } else if (!silenterrors) {
            logger.warn(designator + ": " + ex.getMessage(), ex);
        }
    }
}
@Override public void endRecord() { if (!recordOpen) { return; } try { if (listener != null) { listener.endRecord(); } if (contentHandler != null) { contentHandler.endElement(nsUri, RECORD, RECORD); } if (listener != null) { // emit trailer event, drives record output segmentation listener.trailer(null); } this.recordOpen = false; } catch (Exception ex) { if (fatalerrors) { throw new RuntimeException(ex); } else if (!silenterrors) { logger.warn(designator + ": " + ex.getMessage(), ex); } } }
@Override public void endDataField(Field designator) { try { if (!datafieldOpen) { return; } if (listener != null) { listener.endDataField(designator); } if (designator != null) { String value = designator.data(); if (value != null && !value.isEmpty()) { value = normalizeValue(value); // write data field per default into a subfield with code 'a' AttributesImpl attrs = new AttributesImpl(); attrs.addAttribute(nsUri, CODE, CODE, "CDATA", "a"); if (contentHandler != null) { contentHandler.startElement(nsUri, SUBFIELD, SUBFIELD, attrs); contentHandler.characters(value.toCharArray(), 0, value.length()); contentHandler.endElement(nsUri, SUBFIELD, SUBFIELD); } } } if (contentHandler != null) { contentHandler.endElement(NS_URI, DATAFIELD, DATAFIELD); } datafieldOpen = false; } catch (Exception ex) { if (fatalerrors) { throw new RuntimeException(ex); } else if (!silenterrors) { logger.warn(designator + ": " + ex.getMessage(), ex); } } }
/**
 * Begins a new record; does nothing when a record is already open.
 *
 * <p>Opens a {@code RECORD} element on the content handler, notifies the listener and marks
 * the record as open. The {@code FORMAT} attribute is written only when a format is given and
 * the configured schema is not {@code MARC21}; the {@code TYPE} attribute is written whenever
 * a type is given.
 *
 * <p>Any exception is rethrown wrapped in a {@link RuntimeException} when {@code fatalerrors}
 * is set; otherwise it is logged as a warning unless {@code silenterrors} is set.
 *
 * @param format the record format, may be {@code null}
 * @param type the record type, may be {@code null}
 */
@Override
public void beginRecord(String format, String type) {
    if (recordOpen) {
        return;
    }
    try {
        AttributesImpl attributes = new AttributesImpl();
        boolean writeFormat = format != null && !"MARC21".equalsIgnoreCase(schema);
        if (writeFormat) {
            attributes.addAttribute(nsUri, FORMAT, FORMAT, "CDATA", format);
        }
        if (type != null) {
            attributes.addAttribute(nsUri, TYPE, TYPE, "CDATA", type);
        }
        if (contentHandler != null) {
            contentHandler.startElement(nsUri, RECORD, RECORD, attributes);
        }
        if (listener != null) {
            listener.beginRecord(format, type);
        }
        recordOpen = true;
    } catch (Exception e) {
        if (fatalerrors) {
            throw new RuntimeException(e);
        }
        if (!silenterrors) {
            logger.warn(designator + ": " + e.getMessage(), e);
        }
    }
}
public void beginCollection() throws SAXException { if (contentHandler == null) { logger.warn("no content handler set"); return; } contentHandler.startDocument(); // write schema info AttributesImpl attrs = new AttributesImpl(); if ("MARC21".equalsIgnoreCase(schema)) { this.nsUri = MARC21_NS_URI; attrs.addAttribute( XMLNS.NS_URI, XSI.NS_PREFIX, XMLNS.NS_PREFIX + ":" + XSI.NS_PREFIX, "CDATA", XSI.NS_URI); attrs.addAttribute( XSI.NS_URI, "schemaLocation", XSI.NS_PREFIX + ":schemaLocation", "CDATA", MARC21_NS_URI + " " + MARC21_SCHEMA); } else { this.nsUri = NS_URI; attrs.addAttribute( XMLNS.NS_URI, XSI.NS_PREFIX, XMLNS.NS_PREFIX + ":" + XSI.NS_PREFIX, "CDATA", XSI.NS_URI); attrs.addAttribute( XSI.NS_URI, "schemaLocation", XSI.NS_PREFIX + ":schemaLocation", "CDATA", NS_URI + " " + MARCXCHANGE_SCHEMA); } contentHandler.startPrefixMapping("", nsUri); contentHandler.startElement(nsUri, COLLECTION, COLLECTION, attrs); }
/**
 * Closes the {@code COLLECTION} root element and ends the document.
 *
 * <p>When no content handler is set, a warning is logged and nothing is written.
 *
 * @throws SAXException if the content handler rejects one of the events
 */
public void endCollection() throws SAXException {
    if (contentHandler == null) {
        logger.warn("no content handler set");
        return;
    }
    contentHandler.endElement(nsUri, COLLECTION, COLLECTION);
    // beginCollection declares the default prefix via startPrefixMapping("", nsUri); SAX
    // expects the matching endPrefixMapping right after the corresponding endElement.
    contentHandler.endPrefixMapping("");
    contentHandler.endDocument();
}
@Override public void beginDataField(Field designator) { if (designator == null) { return; } try { if (designator.isControlField()) { beginControlField(designator); endControlField(designator); return; } if (datafieldOpen) { return; } AttributesImpl attrs = new AttributesImpl(); String tag = designator.tag(); if (tag == null || tag.length() == 0) { tag = Field.NULL_TAG; // fallback designator.tag(tag); } attrs.addAttribute(nsUri, TAG, TAG, "CDATA", tag); int ind = designator.indicator() != null ? designator.indicator().length() : 0; // force at least two default blank indicators if schema is Marc 21 if ("MARC21".equalsIgnoreCase(schema)) { for (int i = (ind == 0 ? 1 : ind); i <= 2; i++) { attrs.addAttribute(null, IND + i, IND + i, "CDATA", " "); } } // set indicators for (int i = 1; i <= ind; i++) { attrs.addAttribute( null, IND + i, IND + i, "CDATA", designator.indicator().substring(i - 1, i)); } if (contentHandler != null) { contentHandler.startElement(nsUri, DATAFIELD, DATAFIELD, attrs); } if (listener != null) { listener.beginDataField(designator); } datafieldOpen = true; } catch (Exception ex) { if (fatalerrors) { throw new RuntimeException(ex); } else if (!silenterrors) { logger.warn(designator + ": " + ex.getMessage(), ex); } } }
/**
 * Takes the next URI from the input queue and processes it with {@code push}.
 *
 * <p>When the queue is empty the importer is marked as done. An exception while processing
 * is logged and also marks the importer as done.
 *
 * @return the shared file counter, incremented once for every URI that was processed
 *     without an exception
 */
@Override
public AtomicLong next() {
    if (done) {
        return fileCounter;
    }
    try {
        URI uri = input.poll();
        if (uri != null) {
            push(uri);
            // Count only files that were actually processed; counting the final empty poll
            // as well would inflate the total by one per importer.
            fileCounter.incrementAndGet();
        } else {
            done = true;
        }
    } catch (Exception e) {
        logger.error(e.getMessage(), e);
        done = true;
    }
    return fileCounter;
}
/**
 * Starts a {@code CONTROLFIELD} element carrying the field's tag as the {@code TAG}
 * attribute and then notifies the listener.
 *
 * <p>Any exception is rethrown wrapped in a {@link RuntimeException} when {@code fatalerrors}
 * is set; otherwise it is logged as a warning unless {@code silenterrors} is set.
 *
 * @param designator the control field that begins; a {@code null} value is ignored
 */
@Override
public void beginControlField(Field designator) {
    if (designator == null) {
        return;
    }
    try {
        AttributesImpl attributes = new AttributesImpl();
        String tag = designator.tag();
        attributes.addAttribute(nsUri, TAG, TAG, "CDATA", tag);
        if (contentHandler != null) {
            contentHandler.startElement(nsUri, CONTROLFIELD, CONTROLFIELD, attributes);
        }
        if (listener != null) {
            listener.beginControlField(designator);
        }
    } catch (Exception e) {
        if (fatalerrors) {
            throw new RuntimeException(e);
        }
        if (!silenterrors) {
            logger.warn(designator + ": " + e.getMessage(), e);
        }
    }
}
/**
 * Writes the given value as a complete {@code LEADER} element (start, characters, end) and
 * then forwards it to the listener.
 *
 * <p>Any exception is rethrown wrapped in a {@link RuntimeException} when {@code fatalerrors}
 * is set; otherwise it is logged as a warning unless {@code silenterrors} is set.
 *
 * @param value the leader text; a {@code null} value is ignored
 */
@Override
public void leader(String value) {
    if (value == null) {
        return;
    }
    try {
        if (contentHandler != null) {
            contentHandler.startElement(nsUri, LEADER, LEADER, EMPTY_ATTRIBUTES);
            char[] text = value.toCharArray();
            contentHandler.characters(text, 0, text.length);
            contentHandler.endElement(nsUri, LEADER, LEADER);
        }
        if (listener != null) {
            listener.leader(value);
        }
    } catch (Exception e) {
        if (fatalerrors) {
            throw new RuntimeException(e);
        }
        if (!silenterrors) {
            logger.warn(designator + ": " + e.getMessage(), e);
        }
    }
}
@Override public void endControlField(Field designator) { try { if (listener != null) { listener.endControlField(designator); } if (designator != null) { String value = designator.data(); if (!value.isEmpty()) { switch (designator.tag()) { case "001": this.id = value; break; case "006": case "007": case "008": // fix fill characters here value = value.replace('^', '|'); break; } if (contentHandler != null) { contentHandler.characters(value.toCharArray(), 0, value.length()); } } } if (contentHandler != null) { contentHandler.endElement(nsUri, CONTROLFIELD, CONTROLFIELD); } } catch (Exception ex) { if (fatalerrors) { throw new RuntimeException(ex); } else if (!silenterrors) { logger.warn(designator + ": " + ex.getMessage(), ex); } } }
/**
 * Command line entry point: collects input files, creates a new Elasticsearch index and runs
 * the import with the requested number of threads.
 *
 * <p>The process always terminates through {@link System#exit(int)}: status 1 after printing
 * the help text or after an {@code IOException}, {@code InterruptedException} or
 * {@code ExecutionException}, status 0 otherwise.
 *
 * @param args the command line options; see the help text printed for {@code --help}
 */
public static void main(String[] args) {
    int exitcode = 0;
    try {
        OptionParser parser =
                new OptionParser() {
                    {
                        accepts("elasticsearch").withRequiredArg().ofType(String.class).required();
                        accepts("index").withRequiredArg().ofType(String.class).required();
                        accepts("type").withRequiredArg().ofType(String.class).required();
                        accepts("maxbulkactions")
                                .withRequiredArg()
                                .ofType(Integer.class)
                                .defaultsTo(1000);
                        accepts("maxconcurrentbulkrequests")
                                .withRequiredArg()
                                .ofType(Integer.class)
                                .defaultsTo(4 * Runtime.getRuntime().availableProcessors());
                        accepts("mock")
                                .withOptionalArg()
                                .ofType(Boolean.class)
                                .defaultsTo(Boolean.FALSE);
                        accepts("path").withRequiredArg().ofType(String.class).required();
                        // not marked required: a required option never falls back to its
                        // default, which would make "*.txt" unreachable
                        accepts("pattern")
                                .withRequiredArg()
                                .ofType(String.class)
                                .defaultsTo("*.txt");
                        accepts("threads").withRequiredArg().ofType(Integer.class).defaultsTo(1);
                        accepts("help");
                    }
                };
        // NOTE(review): parse() presumably still rejects a bare --help because the required
        // options are missing; confirm whether the help option should be exempted.
        final OptionSet options = parser.parse(args);
        // "help" takes no argument, so its presence has to be tested with has()
        if (options.has("help")) {
            System.err.println(
                    "Help for "
                            + SpringerCitations.class.getCanonicalName()
                            + lf
                            + " --help print this help message"
                            + lf
                            + " --elasticsearch <uri> Elasticsearch URI"
                            + lf
                            + " --index <index> Elasticsearch index name"
                            + lf
                            + " --type <type> Elasticsearch type name"
                            + lf
                            + " --maxbulkactions <n> the number of bulk actions per request (optional, default: 1000)"
                            + lf
                            + " --maxconcurrentbulkrequests <n> the number of concurrent bulk requests (optional, default: 4 * cpu cores)"
                            + lf
                            + " --path <path> a file path from where the input files are recursively collected (required)"
                            + lf
                            + " --pattern <pattern> a regex for selecting matching file names for input (default: *.txt)"
                            + lf
                            + " --threads <n> the number of threads (optional, default: 1)");
            System.exit(1);
        }

        input =
                new Finder((String) options.valueOf("pattern"))
                        .find((String) options.valueOf("path"))
                        .getURIs();
        final Integer threads = (Integer) options.valueOf("threads");
        logger.info("found {} input files", input.size());

        URI esURI = URI.create((String) options.valueOf("elasticsearch"));
        index = (String) options.valueOf("index");
        type = (String) options.valueOf("type");
        int maxbulkactions = (Integer) options.valueOf("maxbulkactions");
        int maxconcurrentbulkrequests = (Integer) options.valueOf("maxconcurrentbulkrequests");
        boolean mock = (Boolean) options.valueOf("mock");

        final IngestClient es = mock ? new MockIngestClient() : new IngestClient();
        es.maxBulkActions(maxbulkactions)
                .maxConcurrentBulkRequests(maxconcurrentbulkrequests)
                .newClient(esURI)
                .waitForCluster(ClusterHealthStatus.YELLOW, TimeValue.timeValueSeconds(30));

        logger.info("creating new index ...");
        es.setIndex(index).setType(type).newIndex();
        logger.info("... new index created");

        final ResourceSink sink = new ResourceSink(es);
        ImportService service =
                new ImportService()
                        .threads(threads)
                        .factory(
                                new ImporterFactory() {
                                    @Override
                                    public Importer newImporter() {
                                        return new SpringerCitations(sink);
                                    }
                                })
                        .execute();
        logger.info(
                "finished, number of files = {}, resources indexed = {}",
                fileCounter,
                sink.getCounter());

        service.shutdown();
        logger.info("service shutdown");
        es.shutdown();
        logger.info("elasticsearch client shutdown");
    } catch (IOException | InterruptedException | ExecutionException e) {
        logger.error(e.getMessage(), e);
        exitcode = 1;
    }
    System.exit(exitcode);
}
private void push(URI uri) throws Exception { if (uri == null) { return; } InputStream in = factory.open(uri); if (in == null) { throw new IOException("unable to open " + uri); } try (BufferedReader reader = new BufferedReader(new InputStreamReader(in, "UTF-8"))) { String title = null; List<String> author = new LinkedList(); String year = null; String journal = null; String issn = null; String volume = null; String issue = null; String pagination = null; String doi = null; String publisher = null; String line; while ((line = reader.readLine()) != null) { if (line.isEmpty()) { continue; } if ('%' != line.charAt(0)) { continue; } char ch = line.charAt(1); switch (ch) { case 'D': { year = line.substring(3).trim(); break; } case 'T': { title = line.substring(3).trim(); break; } case '@': { issn = line.substring(3).trim(); break; } case 'J': { journal = line.substring(3).trim(); break; } case 'A': { author.add(line.substring(3).trim()); break; } case 'V': { volume = line.substring(3).trim(); break; } case 'N': { issue = line.substring(3).trim(); break; } case 'P': { pagination = line.substring(3).trim(); break; } case 'R': { doi = line.substring(3).trim(); break; } case 'I': { publisher = line.substring(3).trim(); break; } case 'U': { // URL (DOI resolver) break; } case 'K': { // keywords break; } case '0': { // record type break; } case '8': { // day break; } case 'G': { // language break; } default: { logger.warn("unknown tag: " + line); } } } // create bibliographic key String key = author.isEmpty() ? 
null : new WorkAuthor().authorName(author.get(0)).workName(title).createIdentifier(); IRI dereferencable = IRI.builder().scheme("http").host("xbib.info").path("/doi/").fragment(doi).build(); Resource r = resourceContext .newResource() .id(dereferencable) .a(FABIO_ARTICLE) .add("xbib:key", key) .add("prism:doi", doi) .add("dc:title", title); for (String a : author) { r.add("dc:creator", a); } r.add("prism:publicationDate", new SimpleLiteral<>(year).type(Literal.GYEAR)); r.newResource(FRBR_EMBODIMENT).a(FABIO_PERIODICAL_VOLUME).add("prism:volume", volume); r.newResource(FRBR_EMBODIMENT).a(FABIO_PERIODICAL_ISSUE).add("prism:number", issue); r.newResource(FRBR_EMBODIMENT).a(FABIO_PRINT_OBJECT).add("prism:pageRange", pagination); r.newResource(FRBR_PARTOF) .a(FABIO_JOURNAL) .add("prism:publicationName", journal) .add("prism:issn", issn) .add("dc:publisher", publisher); resourceContext .resource() .id( IRI.builder() .scheme("http") .host(index) .query(type) .fragment(resourceContext.resource().id().getFragment()) .build()); out.output(resourceContext, resourceContext.contentBuilder()); } }