/*
 * Groups in the regular expression are identified by round brackets.
 * There are actually 21 groups in the regex. They are defined as follows:
 *
 *  0 the whole triple
 *  1 subject
 *  2 anonymous subject
 *  3 resource subject
 *  4 predicate
 *  5 resource predicate
 *  6 object
 *  7 anonymous object
 *  8 resource object
 *  9 literal object
 * 10 literal value
 * 11 string with quotes in literal value
 * 12 string without quotes in literal value
 * 13 last character in string
 * 14 string with apostrophes in literal value
 * 15 string without apostrophes in literal value
 * 16 last character in string
 * 17 datatype or language
 * 18 datatype with ^^
 * 19 datatype without ^^ (resource)
 * 20 language with @
 * 21 language without @
 */
private void parseLine(String line) throws IOException {
    if (line == null) {
        eof = true;
        return;
    }
    String s = line.trim();
    if (s.length() == 0 || s.startsWith("#")) {
        return;
    }
    Matcher matcher = NTRIPLE_PATTERN.matcher(s);
    S subject;
    P predicate;
    O object;
    if (!matcher.matches()) {
        throw new PatternSyntaxException("The given pattern " + tripleExpression
                + " doesn't match the expression:", s, -1);
    }
    // subject
    if (matcher.group(2) != null) {
        // anonymous node
        subject = (S) simpleFactory.newBlankNode(matcher.group(1));
    } else {
        // resource node
        String subj = matcher.group(1);
        IRI subjURI = IRI.create(subj.substring(1, subj.length() - 1));
        subject = simpleFactory.asSubject(subjURI);
    }
    // predicate
    String p = matcher.group(4);
    predicate = (P) IRI.create(p.substring(1, p.length() - 1));
    // object
    if (matcher.group(7) != null) {
        // anonymous node
        object = (O) simpleFactory.newBlankNode(matcher.group(6));
    } else if (matcher.group(8) != null) {
        // resource node
        String obj = matcher.group(6);
        object = simpleFactory.asObject(IRI.create(obj.substring(1, obj.length() - 1)));
    } else {
        // literal node: group 10 is the literal value with surrounding quotes or
        // apostrophes; for the value without them, look at groups 12 and 15
        String literal = matcher.group(10);
        object = (O) simpleFactory.newLiteral(literal);
    }
    if (listener != null) {
        Triple<S, P, O> stmt = new SimpleTriple<>(subject, predicate, object);
        listener.triple(stmt);
    }
}
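// For orientation only: a reduced sketch of an N-Triples pattern with the same
// overall shape. This is NOT the actual NTRIPLE_PATTERN (the real pattern has
// the 21 groups documented above, covering quoted and apostrophed literals,
// datatypes, and language tags); it only illustrates how round brackets define
// the group numbering the parser relies on. Group 1 is the subject, group 2
// the predicate, group 3 the object.
private static final Pattern SIMPLE_NTRIPLE_SKETCH = Pattern.compile(
        "(_:\\w+|<[^>]*>)\\s+"          // group 1: subject (blank node or IRI)
      + "(<[^>]*>)\\s+"                 // group 2: predicate (IRI)
      + "(_:\\w+|<[^>]*>|\"[^\"]*\")"   // group 3: object (blank node, IRI, or plain literal)
      + "\\s*\\.\\s*");                 // terminating dot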
/**
 * Push Springer citations to Elasticsearch
 */
public class SpringerCitations extends AbstractImporter<Long, AtomicLong> {

    private static final Logger logger = LoggerFactory.getLogger(SpringerCitations.class.getName());

    private static final String lf = System.getProperty("line.separator");

    private static Queue<URI> input;

    private static final AtomicLong fileCounter = new AtomicLong(0L);

    private final SimpleResourceContext resourceContext = new SimpleResourceContext();

    private static String index;

    private static String type;

    private ElementOutput out;

    private boolean done = false;

    public static void main(String[] args) {
        int exitcode = 0;
        try {
            OptionParser parser = new OptionParser() {
                {
                    accepts("elasticsearch").withRequiredArg().ofType(String.class).required();
                    accepts("index").withRequiredArg().ofType(String.class).required();
                    accepts("type").withRequiredArg().ofType(String.class).required();
                    accepts("maxbulkactions").withRequiredArg().ofType(Integer.class).defaultsTo(1000);
                    accepts("maxconcurrentbulkrequests").withRequiredArg().ofType(Integer.class)
                            .defaultsTo(4 * Runtime.getRuntime().availableProcessors());
                    accepts("mock").withOptionalArg().ofType(Boolean.class).defaultsTo(Boolean.FALSE);
                    accepts("path").withRequiredArg().ofType(String.class).required();
                    accepts("pattern").withRequiredArg().ofType(String.class).defaultsTo("*.txt");
                    accepts("threads").withRequiredArg().ofType(Integer.class).defaultsTo(1);
                    accepts("help");
                }
            };
            final OptionSet options = parser.parse(args);
            if (options.has("help")) {
                System.err.println("Help for " + SpringerCitations.class.getCanonicalName() + lf
                        + " --help                           print this help message" + lf
                        + " --elasticsearch <uri>            Elasticsearch URI" + lf
                        + " --index <index>                  Elasticsearch index name" + lf
                        + " --type <type>                    Elasticsearch type name" + lf
                        + " --maxbulkactions <n>             the number of bulk actions per request (optional, default: 1000)" + lf
                        + " --maxconcurrentbulkrequests <n>  the number of concurrent bulk requests (optional, default: 4 * cpu cores)" + lf
                        + " --path <path>                    a file path from where the input files are recursively collected (required)" + lf
                        + " --pattern <pattern>              a regex for selecting matching file names for input (default: *.txt)" + lf
                        + " --threads <n>                    the number of threads (optional, default: 1)");
                System.exit(1);
            }
            input = new Finder((String) options.valueOf("pattern"))
                    .find((String) options.valueOf("path"))
                    .getURIs();
            final Integer threads = (Integer) options.valueOf("threads");
            logger.info("found {} input files", input.size());
            URI esURI = URI.create((String) options.valueOf("elasticsearch"));
            index = (String) options.valueOf("index");
            type = (String) options.valueOf("type");
            int maxbulkactions = (Integer) options.valueOf("maxbulkactions");
            int maxconcurrentbulkrequests = (Integer) options.valueOf("maxconcurrentbulkrequests");
            boolean mock = (Boolean) options.valueOf("mock");
            final IngestClient es = mock ? new MockIngestClient() : new IngestClient();
            es.maxBulkActions(maxbulkactions)
                    .maxConcurrentBulkRequests(maxconcurrentbulkrequests)
                    .newClient(esURI)
                    .waitForCluster(ClusterHealthStatus.YELLOW, TimeValue.timeValueSeconds(30));
            logger.info("creating new index ...");
            es.setIndex(index).setType(type).newIndex();
            logger.info("... new index created");
new index created"); final ResourceSink sink = new ResourceSink(es); ImportService service = new ImportService() .threads(threads) .factory( new ImporterFactory() { @Override public Importer newImporter() { return new SpringerCitations(sink); } }) .execute(); logger.info( "finished, number of files = {}, resources indexed = {}", fileCounter, sink.getCounter()); service.shutdown(); logger.info("service shutdown"); es.shutdown(); logger.info("elasticsearch client shutdown"); } catch (IOException | InterruptedException | ExecutionException e) { logger.error(e.getMessage(), e); exitcode = 1; } System.exit(exitcode); } public SpringerCitations(ElementOutput out) { this.out = out; } @Override public void close() throws IOException { // do not clear input } @Override public boolean hasNext() { if (input.isEmpty()) { done = true; } return !done; } @Override public AtomicLong next() { if (done) { return fileCounter; } try { URI uri = input.poll(); if (uri != null) { push(uri); } else { done = true; } fileCounter.incrementAndGet(); } catch (Exception e) { logger.error(e.getMessage(), e); done = true; } return fileCounter; } private IRI FABIO_ARTICLE = IRI.create("fabio:Article"); private IRI FABIO_JOURNAL = IRI.create("fabio:Journal"); private IRI FABIO_PERIODICAL_VOLUME = IRI.create("fabio:PeriodicalVolume"); private IRI FABIO_PERIODICAL_ISSUE = IRI.create("fabio:PeriodicalIssue"); private IRI FABIO_PRINT_OBJECT = IRI.create("fabio:PrintObject"); private IRI FRBR_PARTOF = IRI.create("frbr:partOf"); private IRI FRBR_EMBODIMENT = IRI.create("frbr:embodiment"); private static final TextFileConnectionFactory factory = new TextFileConnectionFactory(); private void push(URI uri) throws Exception { if (uri == null) { return; } InputStream in = factory.open(uri); if (in == null) { throw new IOException("unable to open " + uri); } try (BufferedReader reader = new BufferedReader(new InputStreamReader(in, "UTF-8"))) { String title = null; List<String> author = new LinkedList(); String year = null; String journal = null; String issn = null; String volume = null; String issue = null; String pagination = null; String doi = null; String publisher = null; String line; while ((line = reader.readLine()) != null) { if (line.isEmpty()) { continue; } if ('%' != line.charAt(0)) { continue; } char ch = line.charAt(1); switch (ch) { case 'D': { year = line.substring(3).trim(); break; } case 'T': { title = line.substring(3).trim(); break; } case '@': { issn = line.substring(3).trim(); break; } case 'J': { journal = line.substring(3).trim(); break; } case 'A': { author.add(line.substring(3).trim()); break; } case 'V': { volume = line.substring(3).trim(); break; } case 'N': { issue = line.substring(3).trim(); break; } case 'P': { pagination = line.substring(3).trim(); break; } case 'R': { doi = line.substring(3).trim(); break; } case 'I': { publisher = line.substring(3).trim(); break; } case 'U': { // URL (DOI resolver) break; } case 'K': { // keywords break; } case '0': { // record type break; } case '8': { // day break; } case 'G': { // language break; } default: { logger.warn("unknown tag: " + line); } } } // create bibliographic key String key = author.isEmpty() ? 
            // create bibliographic key
            String key = author.isEmpty() ? null
                    : new WorkAuthor().authorName(author.get(0)).workName(title).createIdentifier();
            IRI dereferencable = IRI.builder().scheme("http").host("xbib.info")
                    .path("/doi/").fragment(doi).build();
            Resource r = resourceContext.newResource()
                    .id(dereferencable)
                    .a(FABIO_ARTICLE)
                    .add("xbib:key", key)
                    .add("prism:doi", doi)
                    .add("dc:title", title);
            for (String a : author) {
                r.add("dc:creator", a);
            }
            r.add("prism:publicationDate", new SimpleLiteral<>(year).type(Literal.GYEAR));
            r.newResource(FRBR_EMBODIMENT)
                    .a(FABIO_PERIODICAL_VOLUME)
                    .add("prism:volume", volume);
            r.newResource(FRBR_EMBODIMENT)
                    .a(FABIO_PERIODICAL_ISSUE)
                    .add("prism:number", issue);
            r.newResource(FRBR_EMBODIMENT)
                    .a(FABIO_PRINT_OBJECT)
                    .add("prism:pageRange", pagination);
            r.newResource(FRBR_PARTOF)
                    .a(FABIO_JOURNAL)
                    .add("prism:publicationName", journal)
                    .add("prism:issn", issn)
                    .add("dc:publisher", publisher);
            resourceContext.resource().id(IRI.builder()
                    .scheme("http")
                    .host(index)
                    .query(type)
                    .fragment(resourceContext.resource().id().getFragment())
                    .build());
            out.output(resourceContext, resourceContext.contentBuilder());
        }
    }
}
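// A minimal invocation sketch. The endpoint, index names, and path below are
// hypothetical, and the package prefix of the class is omitted; the exact URI
// scheme expected by IngestClient depends on its configuration:
//
//   java SpringerCitations \
//       --elasticsearch "es://localhost:9300" \
//       --index springer \
//       --type citations \
//       --path /data/springer \
//       --pattern "*.txt" \
//       --threads 4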