public class SearchService { private static final Logger logger = LoggerFactory.getLogger(SearchService.class.getName()); @Context ServletConfig servletConfig; @POST @Produces({"application/xhtml+xml; charset=UTF-8"}) public StreamingOutput postXHTML( @QueryParam("q") final String query, @QueryParam("from") final int from, @QueryParam("size") final int size, @QueryParam("service") final String service) throws Exception { return new StreamingOutput() { @Override public void write(OutputStream output) throws IOException, WebApplicationException { try { ZClient client = ZClientFactory.newZClient(service); ZSearchRetrieveRequest request = client.newCQLSearchRetrieveRequest().setQuery(query).setFrom(from).setSize(size); ZSearchRetrieveResponse response = request.execute(); StylesheetTransformer transformer = new StylesheetTransformer("xsl"); response .setStylesheetTransformer(transformer) .setOutputFormat(OutputFormat.XHTML) .to(new OutputStreamWriter(output, "UTF-8")); client.close(); } catch (Diagnostics d) { logger.error(d.getMessage(), d); throw new IOException(d); } catch (IOException e) { logger.error(e.getMessage(), e); throw new IOException(e); } } }; } }
public class MarcXmlReaderTest { private static final Logger logger = LoggerFactory.getLogger(MarcXmlReaderTest.class.getName()); @Test public void testMarcXMLFromOAI() throws Exception { InputStream in = getClass().getResourceAsStream("zdb-oai-marc.xml"); if (in == null) { throw new IOException("input stream not found"); } InputSource source = new InputSource(new InputStreamReader(in, "UTF-8")); MarcXmlReader reader = new MarcXmlReader(source); reader.setListener( new MarcXchangeListener() { @Override public void leader(String label) { logger.debug("leader=" + label); } @Override public void beginRecord(String format, String type) { logger.debug("beginRecord format=" + format + " type=" + type); } @Override public void beginControlField(Field field) { logger.debug("beginControlField field=" + field); } @Override public void endControlField(Field field) { logger.debug("endControlField field=" + field); } @Override public void beginDataField(Field field) { logger.debug("beginDataField field=" + field); } @Override public void endDataField(Field field) { logger.debug("endDataField field=" + field); } @Override public void beginSubField(Field field) { logger.debug("beginSubField field=" + field); } @Override public void endSubField(Field field) { logger.debug("endsubField field=" + field); } @Override public void endRecord() { logger.debug("endRecord"); } @Override public void trailer(String trailer) { logger.debug("trailer " + trailer); } }); reader.parse(); } }
/** Push Springer citations to Elasticsearch */ public class SpringerCitations extends AbstractImporter<Long, AtomicLong> { private static final Logger logger = LoggerFactory.getLogger(SpringerCitations.class.getName()); private static final String lf = System.getProperty("line.separator"); private static Queue<URI> input; private static final AtomicLong fileCounter = new AtomicLong(0L); private final SimpleResourceContext resourceContext = new SimpleResourceContext(); private static String index; private static String type; private ElementOutput out; private boolean done = false; public static void main(String[] args) { int exitcode = 0; try { OptionParser parser = new OptionParser() { { accepts("elasticsearch").withRequiredArg().ofType(String.class).required(); accepts("index").withRequiredArg().ofType(String.class).required(); accepts("type").withRequiredArg().ofType(String.class).required(); accepts("maxbulkactions").withRequiredArg().ofType(Integer.class).defaultsTo(1000); accepts("maxconcurrentbulkrequests") .withRequiredArg() .ofType(Integer.class) .defaultsTo(4 * Runtime.getRuntime().availableProcessors()); accepts("mock").withOptionalArg().ofType(Boolean.class).defaultsTo(Boolean.FALSE); accepts("path").withRequiredArg().ofType(String.class).required(); accepts("pattern") .withRequiredArg() .ofType(String.class) .required() .defaultsTo("*.txt"); accepts("threads").withRequiredArg().ofType(Integer.class).defaultsTo(1); accepts("help"); } }; final OptionSet options = parser.parse(args); if (options.hasArgument("help")) { System.err.println( "Help for " + Medline.class.getCanonicalName() + lf + " --help print this help message" + lf + " --elasticsearch <uri> Elasticesearch URI" + lf + " --index <index> Elasticsearch index name" + lf + " --type <type> Elasticsearch type name" + lf + " --maxbulkactions <n> the number of bulk actions per request (optional, default: 1000)" + " --maxconcurrentbulkrequests <n>the number of concurrent bulk requests (optional, default: 4 * cpu cores)" + " --path <path> a file path from where the input files are recursively collected (required)" + lf + " --pattern <pattern> a regex for selecting matching file names for input (default: *.txt)" + lf + " --threads <n> the number of threads (optional, default: <num-of=cpus)"); System.exit(1); } input = new Finder((String) options.valueOf("pattern")) .find((String) options.valueOf("path")) .getURIs(); final Integer threads = (Integer) options.valueOf("threads"); logger.info("found {} input files", input.size()); URI esURI = URI.create((String) options.valueOf("elasticsearch")); index = (String) options.valueOf("index"); type = (String) options.valueOf("type"); int maxbulkactions = (Integer) options.valueOf("maxbulkactions"); int maxconcurrentbulkrequests = (Integer) options.valueOf("maxconcurrentbulkrequests"); boolean mock = (Boolean) options.valueOf("mock"); final IngestClient es = mock ? new MockIngestClient() : new IngestClient(); es.maxBulkActions(maxbulkactions) .maxConcurrentBulkRequests(maxconcurrentbulkrequests) .newClient(esURI) .waitForCluster(ClusterHealthStatus.YELLOW, TimeValue.timeValueSeconds(30)); logger.info("creating new index ..."); es.setIndex(index).setType(type).newIndex(); logger.info("... new index created"); final ResourceSink sink = new ResourceSink(es); ImportService service = new ImportService() .threads(threads) .factory( new ImporterFactory() { @Override public Importer newImporter() { return new SpringerCitations(sink); } }) .execute(); logger.info( "finished, number of files = {}, resources indexed = {}", fileCounter, sink.getCounter()); service.shutdown(); logger.info("service shutdown"); es.shutdown(); logger.info("elasticsearch client shutdown"); } catch (IOException | InterruptedException | ExecutionException e) { logger.error(e.getMessage(), e); exitcode = 1; } System.exit(exitcode); } public SpringerCitations(ElementOutput out) { this.out = out; } @Override public void close() throws IOException { // do not clear input } @Override public boolean hasNext() { if (input.isEmpty()) { done = true; } return !done; } @Override public AtomicLong next() { if (done) { return fileCounter; } try { URI uri = input.poll(); if (uri != null) { push(uri); } else { done = true; } fileCounter.incrementAndGet(); } catch (Exception e) { logger.error(e.getMessage(), e); done = true; } return fileCounter; } private IRI FABIO_ARTICLE = IRI.create("fabio:Article"); private IRI FABIO_JOURNAL = IRI.create("fabio:Journal"); private IRI FABIO_PERIODICAL_VOLUME = IRI.create("fabio:PeriodicalVolume"); private IRI FABIO_PERIODICAL_ISSUE = IRI.create("fabio:PeriodicalIssue"); private IRI FABIO_PRINT_OBJECT = IRI.create("fabio:PrintObject"); private IRI FRBR_PARTOF = IRI.create("frbr:partOf"); private IRI FRBR_EMBODIMENT = IRI.create("frbr:embodiment"); private static final TextFileConnectionFactory factory = new TextFileConnectionFactory(); private void push(URI uri) throws Exception { if (uri == null) { return; } InputStream in = factory.open(uri); if (in == null) { throw new IOException("unable to open " + uri); } try (BufferedReader reader = new BufferedReader(new InputStreamReader(in, "UTF-8"))) { String title = null; List<String> author = new LinkedList(); String year = null; String journal = null; String issn = null; String volume = null; String issue = null; String pagination = null; String doi = null; String publisher = null; String line; while ((line = reader.readLine()) != null) { if (line.isEmpty()) { continue; } if ('%' != line.charAt(0)) { continue; } char ch = line.charAt(1); switch (ch) { case 'D': { year = line.substring(3).trim(); break; } case 'T': { title = line.substring(3).trim(); break; } case '@': { issn = line.substring(3).trim(); break; } case 'J': { journal = line.substring(3).trim(); break; } case 'A': { author.add(line.substring(3).trim()); break; } case 'V': { volume = line.substring(3).trim(); break; } case 'N': { issue = line.substring(3).trim(); break; } case 'P': { pagination = line.substring(3).trim(); break; } case 'R': { doi = line.substring(3).trim(); break; } case 'I': { publisher = line.substring(3).trim(); break; } case 'U': { // URL (DOI resolver) break; } case 'K': { // keywords break; } case '0': { // record type break; } case '8': { // day break; } case 'G': { // language break; } default: { logger.warn("unknown tag: " + line); } } } // create bibliographic key String key = author.isEmpty() ? null : new WorkAuthor().authorName(author.get(0)).workName(title).createIdentifier(); IRI dereferencable = IRI.builder().scheme("http").host("xbib.info").path("/doi/").fragment(doi).build(); Resource r = resourceContext .newResource() .id(dereferencable) .a(FABIO_ARTICLE) .add("xbib:key", key) .add("prism:doi", doi) .add("dc:title", title); for (String a : author) { r.add("dc:creator", a); } r.add("prism:publicationDate", new SimpleLiteral<>(year).type(Literal.GYEAR)); r.newResource(FRBR_EMBODIMENT).a(FABIO_PERIODICAL_VOLUME).add("prism:volume", volume); r.newResource(FRBR_EMBODIMENT).a(FABIO_PERIODICAL_ISSUE).add("prism:number", issue); r.newResource(FRBR_EMBODIMENT).a(FABIO_PRINT_OBJECT).add("prism:pageRange", pagination); r.newResource(FRBR_PARTOF) .a(FABIO_JOURNAL) .add("prism:publicationName", journal) .add("prism:issn", issn) .add("dc:publisher", publisher); resourceContext .resource() .id( IRI.builder() .scheme("http") .host(index) .query(type) .fragment(resourceContext.resource().id().getFragment()) .build()); out.output(resourceContext, resourceContext.contentBuilder()); } } }
/** A Sax adapter for MarcXchange */ public class MarcXchangeSaxAdapter implements MarcXchangeConstants, MarcXchangeListener { private static final Logger logger = LoggerFactory.getLogger(MarcXchangeSaxAdapter.class.getName()); private static final AttributesImpl EMPTY_ATTRIBUTES = new AttributesImpl(); private static final CharStreamFactory factory = CharStreamFactory.getInstance(); private final CharStreamListener streamListener = new Iso2709StreamListener(); private CharStream stream; private char mark = '\u0000'; private int position = 0; private FieldDirectory directory; private Field designator; private RecordLabel label; private boolean datafieldOpen; private boolean subfieldOpen; private boolean recordOpen; private String schema; private String format; private String type; private String id; private String nsUri; private ContentHandler contentHandler; private MarcXchangeListener listener; private boolean fatalerrors = false; private boolean silenterrors = false; private int buffersize = 8192; public MarcXchangeSaxAdapter() { this.nsUri = NS_URI; this.subfieldOpen = false; this.recordOpen = false; } public MarcXchangeSaxAdapter buffersize(int buffersize) { this.buffersize = buffersize; return this; } public MarcXchangeSaxAdapter inputSource(final InputSource source) throws IOException { if (source.getByteStream() != null) { String encoding = source.getEncoding() != null ? source.getEncoding() : "ANSEL"; Reader reader = new InputStreamReader(source.getByteStream(), encoding); this.stream = factory.newStream(reader, buffersize, streamListener); } else { Reader reader = source.getCharacterStream(); this.stream = factory.newStream(reader, buffersize, streamListener); } return this; } public MarcXchangeSaxAdapter setContentHandler(ContentHandler handler) { this.contentHandler = handler; return this; } public MarcXchangeSaxAdapter setListener(MarcXchangeListener listener) { this.listener = listener; return this; } public MarcXchangeSaxAdapter setSchema(String schema) { this.schema = schema; return this; } public MarcXchangeSaxAdapter setFormat(String format) { this.format = format; return this; } public MarcXchangeSaxAdapter setType(String type) { this.type = type; return this; } public MarcXchangeSaxAdapter setFatalErrors(Boolean fatalerrors) { this.fatalerrors = fatalerrors; return this; } public MarcXchangeSaxAdapter setSilentErrors(Boolean silenterrors) { this.silenterrors = silenterrors; return this; } public String getIdentifier() { return id; } /** Parse ISO 2709 and emit SAX events. */ public void parse() throws IOException, SAXException { beginCollection(); String chunk; do { chunk = stream.readData(); } while (chunk != null); stream.close(); endCollection(); } public void beginCollection() throws SAXException { if (contentHandler == null) { logger.warn("no content handler set"); return; } contentHandler.startDocument(); // write schema info AttributesImpl attrs = new AttributesImpl(); if ("MARC21".equalsIgnoreCase(schema)) { this.nsUri = MARC21_NS_URI; attrs.addAttribute( XMLNS.NS_URI, XSI.NS_PREFIX, XMLNS.NS_PREFIX + ":" + XSI.NS_PREFIX, "CDATA", XSI.NS_URI); attrs.addAttribute( XSI.NS_URI, "schemaLocation", XSI.NS_PREFIX + ":schemaLocation", "CDATA", MARC21_NS_URI + " " + MARC21_SCHEMA); } else { this.nsUri = NS_URI; attrs.addAttribute( XMLNS.NS_URI, XSI.NS_PREFIX, XMLNS.NS_PREFIX + ":" + XSI.NS_PREFIX, "CDATA", XSI.NS_URI); attrs.addAttribute( XSI.NS_URI, "schemaLocation", XSI.NS_PREFIX + ":schemaLocation", "CDATA", NS_URI + " " + MARCXCHANGE_SCHEMA); } contentHandler.startPrefixMapping("", nsUri); contentHandler.startElement(nsUri, COLLECTION, COLLECTION, attrs); } public void endCollection() throws SAXException { if (contentHandler == null) { logger.warn("no content handler set"); return; } contentHandler.endElement(nsUri, COLLECTION, COLLECTION); contentHandler.endDocument(); } @Override public void beginRecord(String format, String type) { if (recordOpen) { return; } try { AttributesImpl attrs = new AttributesImpl(); if (format != null && !"MARC21".equalsIgnoreCase(schema)) { attrs.addAttribute(nsUri, FORMAT, FORMAT, "CDATA", format); } if (type != null) { attrs.addAttribute(nsUri, TYPE, TYPE, "CDATA", type); } if (contentHandler != null) { contentHandler.startElement(nsUri, RECORD, RECORD, attrs); } if (listener != null) { listener.beginRecord(format, type); } this.recordOpen = true; } catch (Exception ex) { if (fatalerrors) { throw new RuntimeException(ex); } else if (!silenterrors) { logger.warn(designator + ": " + ex.getMessage(), ex); } } } @Override public void endRecord() { if (!recordOpen) { return; } try { if (listener != null) { listener.endRecord(); } if (contentHandler != null) { contentHandler.endElement(nsUri, RECORD, RECORD); } if (listener != null) { // emit trailer event, drives record output segmentation listener.trailer(null); } this.recordOpen = false; } catch (Exception ex) { if (fatalerrors) { throw new RuntimeException(ex); } else if (!silenterrors) { logger.warn(designator + ": " + ex.getMessage(), ex); } } } @Override public void leader(String value) { if (value == null) { return; } try { if (contentHandler != null) { contentHandler.startElement(nsUri, LEADER, LEADER, EMPTY_ATTRIBUTES); contentHandler.characters(value.toCharArray(), 0, value.length()); contentHandler.endElement(nsUri, LEADER, LEADER); } if (listener != null) { listener.leader(value); } } catch (Exception ex) { if (fatalerrors) { throw new RuntimeException(ex); } else if (!silenterrors) { logger.warn(designator + ": " + ex.getMessage(), ex); } } } @Override public void trailer(String trailer) { // do nothing, MARC reading defines no trailer } @Override public void beginControlField(Field designator) { if (designator == null) { return; } try { AttributesImpl attrs = new AttributesImpl(); attrs.addAttribute(nsUri, TAG, TAG, "CDATA", designator.tag()); if (contentHandler != null) { contentHandler.startElement(nsUri, CONTROLFIELD, CONTROLFIELD, attrs); } if (listener != null) { listener.beginControlField(designator); } } catch (Exception ex) { if (fatalerrors) { throw new RuntimeException(ex); } else if (!silenterrors) { logger.warn(designator + ": " + ex.getMessage(), ex); } } } @Override public void endControlField(Field designator) { try { if (listener != null) { listener.endControlField(designator); } if (designator != null) { String value = designator.data(); if (!value.isEmpty()) { switch (designator.tag()) { case "001": this.id = value; break; case "006": case "007": case "008": // fix fill characters here value = value.replace('^', '|'); break; } if (contentHandler != null) { contentHandler.characters(value.toCharArray(), 0, value.length()); } } } if (contentHandler != null) { contentHandler.endElement(nsUri, CONTROLFIELD, CONTROLFIELD); } } catch (Exception ex) { if (fatalerrors) { throw new RuntimeException(ex); } else if (!silenterrors) { logger.warn(designator + ": " + ex.getMessage(), ex); } } } @Override public void beginDataField(Field designator) { if (designator == null) { return; } try { if (designator.isControlField()) { beginControlField(designator); endControlField(designator); return; } if (datafieldOpen) { return; } AttributesImpl attrs = new AttributesImpl(); String tag = designator.tag(); if (tag == null || tag.length() == 0) { tag = Field.NULL_TAG; // fallback designator.tag(tag); } attrs.addAttribute(nsUri, TAG, TAG, "CDATA", tag); int ind = designator.indicator() != null ? designator.indicator().length() : 0; // force at least two default blank indicators if schema is Marc 21 if ("MARC21".equalsIgnoreCase(schema)) { for (int i = (ind == 0 ? 1 : ind); i <= 2; i++) { attrs.addAttribute(null, IND + i, IND + i, "CDATA", " "); } } // set indicators for (int i = 1; i <= ind; i++) { attrs.addAttribute( null, IND + i, IND + i, "CDATA", designator.indicator().substring(i - 1, i)); } if (contentHandler != null) { contentHandler.startElement(nsUri, DATAFIELD, DATAFIELD, attrs); } if (listener != null) { listener.beginDataField(designator); } datafieldOpen = true; } catch (Exception ex) { if (fatalerrors) { throw new RuntimeException(ex); } else if (!silenterrors) { logger.warn(designator + ": " + ex.getMessage(), ex); } } } @Override public void endDataField(Field designator) { try { if (!datafieldOpen) { return; } if (listener != null) { listener.endDataField(designator); } if (designator != null) { String value = designator.data(); if (value != null && !value.isEmpty()) { value = normalizeValue(value); // write data field per default into a subfield with code 'a' AttributesImpl attrs = new AttributesImpl(); attrs.addAttribute(nsUri, CODE, CODE, "CDATA", "a"); if (contentHandler != null) { contentHandler.startElement(nsUri, SUBFIELD, SUBFIELD, attrs); contentHandler.characters(value.toCharArray(), 0, value.length()); contentHandler.endElement(nsUri, SUBFIELD, SUBFIELD); } } } if (contentHandler != null) { contentHandler.endElement(NS_URI, DATAFIELD, DATAFIELD); } datafieldOpen = false; } catch (Exception ex) { if (fatalerrors) { throw new RuntimeException(ex); } else if (!silenterrors) { logger.warn(designator + ": " + ex.getMessage(), ex); } } } @Override public void beginSubField(Field designator) { if (designator == null) { return; } try { AttributesImpl attrs = new AttributesImpl(); String subfieldId = designator.subfieldId(); if (subfieldId == null || subfieldId.length() == 0) { subfieldId = "a"; // fallback } attrs.addAttribute(nsUri, CODE, CODE, "CDATA", subfieldId); if (contentHandler != null) { contentHandler.startElement(nsUri, SUBFIELD, SUBFIELD, attrs); } if (listener != null) { listener.beginSubField(designator); } } catch (Exception ex) { if (fatalerrors) { throw new RuntimeException(ex); } else if (!silenterrors) { logger.warn(designator + ": " + ex.getMessage(), ex); } } } @Override public void endSubField(Field designator) { if (designator == null) { return; } try { if (listener != null) { listener.endSubField(designator); } if (designator != null) { if (contentHandler != null) { String value = designator.data(); if (!value.isEmpty()) { value = normalizeValue(value); contentHandler.characters(value.toCharArray(), 0, value.length()); } } } if (contentHandler != null) { contentHandler.endElement(NS_URI, SUBFIELD, SUBFIELD); } } catch (Exception ex) { if (fatalerrors) { throw new RuntimeException(ex); } else if (!silenterrors) { logger.warn(designator + ": " + ex.getMessage(), ex); } } } protected String normalizeValue(String value) { return XMLUtil.clean(Normalizer.normalize(value, Form.NFC)); } private class Iso2709StreamListener implements CharStreamListener { @Override public void data(String data) { String fieldContent = data; try { switch (mark) { case Separable.FS: // start/end file break; case Separable.GS: // start/end of group within a stream if (subfieldOpen) { // close subfield if open subfieldOpen = false; endDataField(null); } endDataField(designator); endRecord(); // close record // fall through is ok! case '\u0000': // start of stream position = 0; // skip line-feed (OCLC PICA quirk) if (data.charAt(0) == '\n') { fieldContent = data.substring(1); } if (fieldContent.length() >= RecordLabel.LENGTH) { beginRecord(format, type); String labelStr = fieldContent.substring(0, RecordLabel.LENGTH); label = new RecordLabel(labelStr.toCharArray()); // auto-repair label leader(label.getFixed()); directory = new FieldDirectory(label, fieldContent); if (directory.isEmpty()) { designator = new Field(label, fieldContent.substring(RecordLabel.LENGTH)); if (designator.tag() != null) { beginDataField(designator); } } } else { directory = new FieldDirectory(label, fieldContent); designator = new Field(); } break; case Separable.RS: if (subfieldOpen) { subfieldOpen = false; endDataField(null); // force data field close } else if (designator != null && !designator.isEmpty()) { if (datafieldOpen) { endDataField(designator); } } if (directory == null || directory.isEmpty()) { designator = new Field(label, fieldContent); } else if (directory.containsKey(position)) { designator = new Field(label, directory.get(position), fieldContent, false); } else { throw new InvalidFieldDirectoryException( "byte position not found in directory: " + position + " - is this stream reading using an 8-bit wide encoding?"); } if (designator != null) { beginDataField(designator); } break; case Separable.US: if (!subfieldOpen) { subfieldOpen = true; beginDataField(designator); } if (designator != null) { designator = new Field(label, designator, fieldContent, true); beginSubField(designator); } endSubField(designator); break; } } catch (InvalidFieldDirectoryException ex) { logger.warn(ex.getMessage()); } finally { position += data.length(); } } @Override public void markUnit() { mark = Separable.US; position++; } @Override public void markRecord() { mark = Separable.RS; position++; } @Override public void markGroup() { mark = Separable.GS; position++; } @Override public void markFile() { mark = Separable.FS; position++; endDataField(null); endRecord(); } } }