Code Example #1
File: NTripleReader.java  Project: pombredanne/xbib
 /*
  * Groups in the regular expression are identified by round brackets. There
  * are actually 21 groups in the regex. They are defined as follows:
  *
  *  0  the whole triple
  *  1  subject
  *  2  anonymous subject
  *  3  resource subject
  *  4  predicate
  *  5  resource predicate
  *  6  object
  *  7  anonymous object
  *  8  resource object
  *  9  literal object
  * 10  literal value
  * 11  string with quotes in literal value
  * 12  string without quotes in literal value
  * 13  last character in string
  * 14  string with apostrophes in literal value
  * 15  string without apostrophes in literal value
  * 16  last character in string
  * 17  datatype or language
  * 18  datatype with ^^
  * 19  datatype without ^^ (resource)
  * 20  language with @
  * 21  language without @
  */
 private void parseLine(String line) throws IOException {
   if (line == null) {
     eof = true;
     return;
   }
   String s = line.trim();
   if (s.length() == 0 || s.startsWith("#")) {
     return;
   }
   Matcher matcher = NTRIPLE_PATTERN.matcher(s);
   S subject;
   P predicate;
   O object;
   if (!matcher.matches()) {
     throw new PatternSyntaxException(
         "The given pattern " + tripleExpression + " doesn't match the expression:", s, -1);
   }
   // subject
   if (matcher.group(2) != null) {
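      // anonymous (blank) node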
     subject = (S) simpleFactory.newBlankNode(matcher.group(1));
   } else {
     // resource node
     String subj = matcher.group(1);
     IRI subjURI = IRI.create(subj.substring(1, subj.length() - 1));
     subject = simpleFactory.asSubject(subjURI);
   }
   // predicate
   String p = matcher.group(4);
   predicate = (P) IRI.create(p.substring(1, p.length() - 1));
   // object
   if (matcher.group(7) != null) {
     // anonymous node
     object = (O) simpleFactory.newBlankNode(matcher.group(6));
   } else if (matcher.group(8) != null) {
     // resource node
     String obj = matcher.group(6);
     object = simpleFactory.asObject(IRI.create(obj.substring(1, obj.length() - 1)));
   } else {
     // literal node
      // group 10 is the literal value with quotes or apostrophes;
      // to get the value without them, look at groups 12 and 15
     String literal = matcher.group(10);
     object = (O) simpleFactory.newLiteral(literal);
   }
   if (listener != null) {
     Triple stmt = new SimpleTriple<>(subject, predicate, object);
     listener.triple(stmt);
   }
 }
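
The comment at the top of parseLine() describes how the reader's NTRIPLE_PATTERN (not shown here) breaks a line into capture groups. As a rough illustration only, the following self-contained sketch uses a much simpler, hypothetical pattern (SIMPLE_TRIPLE, not the project's NTRIPLE_PATTERN) to pull subject, predicate, and object out of a single N-Triples line; blank nodes, datatypes, and language tags are ignored.

// Illustrative sketch only: SIMPLE_TRIPLE is a simplified, assumed pattern,
// not the NTRIPLE_PATTERN used by NTripleReader above.
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class SimpleTripleDemo {

  // group 1 = subject IRI, group 2 = predicate IRI, group 3 = object term
  private static final Pattern SIMPLE_TRIPLE =
      Pattern.compile("^<([^>]*)>\\s+<([^>]*)>\\s+(.+?)\\s*\\.$");

  public static void main(String[] args) {
    String line = "<http://example.org/s> <http://example.org/p> \"hello\" .";
    Matcher m = SIMPLE_TRIPLE.matcher(line);
    if (m.matches()) {
      System.out.println("subject   = " + m.group(1)); // http://example.org/s
      System.out.println("predicate = " + m.group(2)); // http://example.org/p
      System.out.println("object    = " + m.group(3)); // "hello"
    }
  }
}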
Code Example #2
/** Push Springer citations to Elasticsearch */
public class SpringerCitations extends AbstractImporter<Long, AtomicLong> {

  private static final Logger logger = LoggerFactory.getLogger(SpringerCitations.class.getName());

  private static final String lf = System.getProperty("line.separator");

  private static Queue<URI> input;

  private static final AtomicLong fileCounter = new AtomicLong(0L);

  private final SimpleResourceContext resourceContext = new SimpleResourceContext();

  private static String index;

  private static String type;

  private ElementOutput out;

  private boolean done = false;

  public static void main(String[] args) {
    int exitcode = 0;
    try {
      OptionParser parser =
          new OptionParser() {
            {
              accepts("elasticsearch").withRequiredArg().ofType(String.class).required();
              accepts("index").withRequiredArg().ofType(String.class).required();
              accepts("type").withRequiredArg().ofType(String.class).required();
              accepts("maxbulkactions").withRequiredArg().ofType(Integer.class).defaultsTo(1000);
              accepts("maxconcurrentbulkrequests")
                  .withRequiredArg()
                  .ofType(Integer.class)
                  .defaultsTo(4 * Runtime.getRuntime().availableProcessors());
              accepts("mock").withOptionalArg().ofType(Boolean.class).defaultsTo(Boolean.FALSE);
              accepts("path").withRequiredArg().ofType(String.class).required();
              accepts("pattern")
                  .withRequiredArg()
                  .ofType(String.class)
                  .required()
                  .defaultsTo("*.txt");
              accepts("threads").withRequiredArg().ofType(Integer.class).defaultsTo(1);
              accepts("help");
            }
          };
      final OptionSet options = parser.parse(args);
      if (options.has("help")) {
        System.err.println(
            "Help for "
                + SpringerCitations.class.getCanonicalName()
                + lf
                + " --help                 print this help message"
                + lf
                + " --elasticsearch <uri>  Elasticsearch URI"
                + lf
                + " --index <index>        Elasticsearch index name"
                + lf
                + " --type <type>          Elasticsearch type name"
                + lf
                + " --maxbulkactions <n>   the number of bulk actions per request (optional, default: 1000)"
                + lf
                + " --maxconcurrentbulkrequests <n> the number of concurrent bulk requests (optional, default: 4 * cpu cores)"
                + lf
                + " --path <path>          a file path from where the input files are recursively collected (required)"
                + lf
                + " --pattern <pattern>    a regex for selecting matching file names for input (default: *.txt)"
                + lf
                + " --threads <n>          the number of threads (optional, default: <num-of-cpus>)");
        System.exit(1);
      }
      input =
          new Finder((String) options.valueOf("pattern"))
              .find((String) options.valueOf("path"))
              .getURIs();
      final Integer threads = (Integer) options.valueOf("threads");

      logger.info("found {} input files", input.size());

      URI esURI = URI.create((String) options.valueOf("elasticsearch"));
      index = (String) options.valueOf("index");
      type = (String) options.valueOf("type");
      int maxbulkactions = (Integer) options.valueOf("maxbulkactions");
      int maxconcurrentbulkrequests = (Integer) options.valueOf("maxconcurrentbulkrequests");
      boolean mock = (Boolean) options.valueOf("mock");

      final IngestClient es = mock ? new MockIngestClient() : new IngestClient();

      es.maxBulkActions(maxbulkactions)
          .maxConcurrentBulkRequests(maxconcurrentbulkrequests)
          .newClient(esURI)
          .waitForCluster(ClusterHealthStatus.YELLOW, TimeValue.timeValueSeconds(30));

      logger.info("creating new index ...");
      es.setIndex(index).setType(type).newIndex();
      logger.info("... new index created");

      final ResourceSink sink = new ResourceSink(es);

      ImportService service =
          new ImportService()
              .threads(threads)
              .factory(
                  new ImporterFactory() {
                    @Override
                    public Importer newImporter() {
                      return new SpringerCitations(sink);
                    }
                  })
              .execute();

      logger.info(
          "finished, number of files = {}, resources indexed = {}", fileCounter, sink.getCounter());

      service.shutdown();
      logger.info("service shutdown");

      es.shutdown();
      logger.info("elasticsearch client shutdown");

    } catch (IOException | InterruptedException | ExecutionException e) {
      logger.error(e.getMessage(), e);
      exitcode = 1;
    }
    System.exit(exitcode);
  }

  public SpringerCitations(ElementOutput out) {
    this.out = out;
  }

  @Override
  public void close() throws IOException {
    // do not clear input
  }

  @Override
  public boolean hasNext() {
    if (input.isEmpty()) {
      done = true;
    }
    return !done;
  }

  @Override
  public AtomicLong next() {
    if (done) {
      return fileCounter;
    }
    try {
      URI uri = input.poll();
      if (uri != null) {
        push(uri);
      } else {
        done = true;
      }
      fileCounter.incrementAndGet();
    } catch (Exception e) {
      logger.error(e.getMessage(), e);
      done = true;
    }
    return fileCounter;
  }

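  // FaBiO class terms and FRBR property terms used when building the bibliographic resource in push()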
  private IRI FABIO_ARTICLE = IRI.create("fabio:Article");

  private IRI FABIO_JOURNAL = IRI.create("fabio:Journal");

  private IRI FABIO_PERIODICAL_VOLUME = IRI.create("fabio:PeriodicalVolume");

  private IRI FABIO_PERIODICAL_ISSUE = IRI.create("fabio:PeriodicalIssue");

  private IRI FABIO_PRINT_OBJECT = IRI.create("fabio:PrintObject");

  private IRI FRBR_PARTOF = IRI.create("frbr:partOf");

  private IRI FRBR_EMBODIMENT = IRI.create("frbr:embodiment");

  private static final TextFileConnectionFactory factory = new TextFileConnectionFactory();

  private void push(URI uri) throws Exception {
    if (uri == null) {
      return;
    }
    InputStream in = factory.open(uri);
    if (in == null) {
      throw new IOException("unable to open " + uri);
    }
    try (BufferedReader reader = new BufferedReader(new InputStreamReader(in, "UTF-8"))) {
      String title = null;
      List<String> author = new LinkedList<>();
      String year = null;
      String journal = null;
      String issn = null;
      String volume = null;
      String issue = null;
      String pagination = null;
      String doi = null;
      String publisher = null;
      String line;
      while ((line = reader.readLine()) != null) {
        if (line.isEmpty()) {
          continue;
        }
        if ('%' != line.charAt(0)) {
          continue;
        }
        char ch = line.charAt(1);
        switch (ch) {
          case 'D':
            {
              year = line.substring(3).trim();
              break;
            }
          case 'T':
            {
              title = line.substring(3).trim();
              break;
            }
          case '@':
            {
              issn = line.substring(3).trim();
              break;
            }
          case 'J':
            {
              journal = line.substring(3).trim();
              break;
            }
          case 'A':
            {
              author.add(line.substring(3).trim());
              break;
            }
          case 'V':
            {
              volume = line.substring(3).trim();
              break;
            }
          case 'N':
            {
              issue = line.substring(3).trim();
              break;
            }
          case 'P':
            {
              pagination = line.substring(3).trim();
              break;
            }
          case 'R':
            {
              doi = line.substring(3).trim();
              break;
            }
          case 'I':
            {
              publisher = line.substring(3).trim();
              break;
            }
          case 'U':
            {
              // URL (DOI resolver)
              break;
            }
          case 'K':
            {
              // keywords
              break;
            }
          case '0':
            {
              // record type
              break;
            }
          case '8':
            {
              // day
              break;
            }
          case 'G':
            {
              // language
              break;
            }
          default:
            {
              logger.warn("unknown tag: " + line);
            }
        }
      }
      // create bibliographic key

      String key =
          author.isEmpty()
              ? null
              : new WorkAuthor().authorName(author.get(0)).workName(title).createIdentifier();

      IRI dereferencable =
          IRI.builder().scheme("http").host("xbib.info").path("/doi/").fragment(doi).build();

      Resource r =
          resourceContext
              .newResource()
              .id(dereferencable)
              .a(FABIO_ARTICLE)
              .add("xbib:key", key)
              .add("prism:doi", doi)
              .add("dc:title", title);
      for (String a : author) {
        r.add("dc:creator", a);
      }
      r.add("prism:publicationDate", new SimpleLiteral<>(year).type(Literal.GYEAR));
      r.newResource(FRBR_EMBODIMENT).a(FABIO_PERIODICAL_VOLUME).add("prism:volume", volume);
      r.newResource(FRBR_EMBODIMENT).a(FABIO_PERIODICAL_ISSUE).add("prism:number", issue);
      r.newResource(FRBR_EMBODIMENT).a(FABIO_PRINT_OBJECT).add("prism:pageRange", pagination);
      r.newResource(FRBR_PARTOF)
          .a(FABIO_JOURNAL)
          .add("prism:publicationName", journal)
          .add("prism:issn", issn)
          .add("dc:publisher", publisher);
      resourceContext
          .resource()
          .id(
              IRI.builder()
                  .scheme("http")
                  .host(index)
                  .query(type)
                  .fragment(resourceContext.resource().id().getFragment())
                  .build());
      out.output(resourceContext, resourceContext.contentBuilder());
    }
  }
}
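
The push() method above reads a tagged, EndNote-style citation format: each field line starts with '%' followed by a tag character, a space, and the value (read from index 3). The record below is a hypothetical sample of such input, with invented values rather than real Springer data, annotated with the field each tag maps to in the switch statement.

// Hypothetical sample input for push(); values are invented for illustration.
// Each line has the form "%<tag> <value>"; the switch above reads the value from index 3.
public class SpringerCitationSampleRecord {
  public static void main(String[] args) {
    String sample = String.join(System.lineSeparator(),
        "%T An example article title",   // 'T' -> title
        "%A Doe, Jane",                  // 'A' -> author (repeatable)
        "%J Journal of Examples",        // 'J' -> journal name
        "%@ 1234-5678",                  // '@' -> ISSN
        "%D 2012",                       // 'D' -> publication year
        "%V 42",                         // 'V' -> volume
        "%N 7",                          // 'N' -> issue
        "%P 100-110",                    // 'P' -> pagination
        "%R 10.1000/example-doi",        // 'R' -> DOI
        "%I Example Publisher",          // 'I' -> publisher
        "%U http://dx.doi.org/10.1000/example-doi"); // 'U' -> ignored (DOI resolver URL)
    System.out.println(sample);
  }
}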