/**
 * Compiles every entry of {@code entries} into a {@link TokenSequencePattern} and combines the
 * compiled patterns into one multi-pattern matcher.
 *
 * <p>An entry that carries a {@code tokensRegex} is compiled from that expression using an
 * environment whose default string flags follow {@code ignoreCase}. Any other entry is compiled
 * from its per-token {@code regex} strings, one node pattern per token. When a valid-POS pattern
 * is configured and the POS match type is {@code MATCH_ALL_TOKENS}, each of those node patterns
 * additionally constrains the token's part-of-speech annotation.
 *
 * @param patternToEntry output map; receives one mapping from each compiled pattern back to the
 *     entry it was built from
 * @return a matcher over all compiled patterns
 * @throws RuntimeException if an entry's {@code annotateGroup} is not a valid group index of its
 *     compiled pattern
 */
private MultiPatternMatcher<CoreMap> createPatternMatcher(
    Map<SequencePattern<CoreMap>, Entry> patternToEntry) {
  // Convert to tokensregex pattern
  int patternFlags = ignoreCase ? Pattern.CASE_INSENSITIVE : 0;
  int stringMatchFlags = ignoreCase ? NodePattern.CASE_INSENSITIVE : 0;
  Env env = TokenSequencePattern.getNewEnv();
  env.setDefaultStringPatternFlags(patternFlags);
  env.setDefaultStringMatchFlags(stringMatchFlags);
  // Only built when the POS constraint must hold for every token of a match.
  NodePattern<String> posTagPattern =
      (validPosPattern != null && PosMatchType.MATCH_ALL_TOKENS.equals(posMatchType))
          ? new CoreMapNodePattern.StringAnnotationRegexPattern(validPosPattern)
          : null;
  List<TokenSequencePattern> patterns = new ArrayList<>(entries.size());
  for (Entry entry : entries) {
    TokenSequencePattern pattern;
    if (entry.tokensRegex != null) {
      // TODO: posTagPatterns...
      pattern = TokenSequencePattern.compile(env, entry.tokensRegex);
    } else {
      List<SequencePattern.PatternExpr> nodePatterns = new ArrayList<>();
      for (String p : entry.regex) {
        CoreMapNodePattern c = CoreMapNodePattern.valueOf(p, patternFlags);
        if (posTagPattern != null) {
          c.add(CoreAnnotations.PartOfSpeechAnnotation.class, posTagPattern);
        }
        nodePatterns.add(new SequencePattern.NodePatternExpr(c));
      }
      pattern =
          TokenSequencePattern.compile(new SequencePattern.SequencePatternExpr(nodePatterns));
    }
    // getTotalGroups() counts the implicit whole-match group 0, so the largest valid group
    // index is getTotalGroups() - 1; an index equal to the total must be rejected as well.
    if (entry.annotateGroup < 0 || entry.annotateGroup >= pattern.getTotalGroups()) {
      throw new RuntimeException("Invalid match group for entry " + entry);
    }
    pattern.setPriority(entry.priority);
    patterns.add(pattern);
    patternToEntry.put(pattern, entry);
  }
  return TokenSequencePattern.getMultiPatternMatcher(patterns);
}
 public WordsToSentencesAnnotator(
     boolean verbose,
     String boundaryTokenRegex,
     Set<String> boundaryToDiscard,
     Set<String> htmlElementsToDiscard,
     String newlineIsSentenceBreak,
     String boundaryMultiTokenRegex,
     Set<String> tokenRegexesToDiscard) {
   this(
       verbose,
       false,
       new WordToSentenceProcessor<CoreLabel>(
           boundaryTokenRegex,
           boundaryToDiscard,
           htmlElementsToDiscard,
           WordToSentenceProcessor.stringToNewlineIsSentenceBreak(newlineIsSentenceBreak),
           (boundaryMultiTokenRegex != null)
               ? TokenSequencePattern.compile(boundaryMultiTokenRegex)
               : null,
           tokenRegexesToDiscard));
 }
示例#3
0
    /**
     * Serves a TokensRegex query over the request's document.
     *
     * <p>The document is obtained via {@code getDocument} and annotated with the
     * {@code tokenize,ssplit,pos,lemma,ner} pipeline if it has no sentences yet. URL parameters:
     *
     * <ul>
     *   <li>{@code pattern} (required): the TokensRegex expression to compile.
     *   <li>{@code filter} (optional, default {@code "false"}): when empty/blank or
     *       {@code "true"} (case-insensitive), the response's {@code "sentences"} entry is a list
     *       with one boolean per sentence telling whether the whole sentence matches. Otherwise
     *       it is a list with one object per sentence holding every match found, keyed by match
     *       index, plus a {@code "length"} entry with the number of matches.
     * </ul>
     *
     * <p>The work runs on {@code corenlpExecutor}; this method waits at most five seconds for it
     * and answers with an error if that wait fails.
     *
     * @param httpExchange the exchange to read the request from and write the response to
     * @throws IOException if writing the response (or an error response) fails
     */
    @Override
    public void handle(HttpExchange httpExchange) throws IOException {
      // Set common response headers
      httpExchange.getResponseHeaders().add("Access-Control-Allow-Origin", "*");

      Future<String> json =
          corenlpExecutor.submit(
              () -> {
                try {
                  // Get the document
                  Properties props = new Properties();
                  props.setProperty("annotators", "tokenize,ssplit,pos,lemma,ner");
                  Annotation doc = getDocument(props, httpExchange);
                  if (!doc.containsKey(CoreAnnotations.SentencesAnnotation.class)) {
                    StanfordCoreNLP pipeline = mkStanfordCoreNLP(props);
                    pipeline.annotate(doc);
                  }

                  // Construct the matcher
                  Map<String, String> params = getURLParams(httpExchange.getRequestURI());
                  // (get the pattern)
                  if (!params.containsKey("pattern")) {
                    respondError("Missing required parameter 'pattern'", httpExchange);
                    return "";
                  }
                  String pattern = params.get("pattern");
                  // (get whether to filter / find)
                  String filterStr = params.getOrDefault("filter", "false");
                  final boolean filter =
                      filterStr.trim().isEmpty() || "true".equalsIgnoreCase(filterStr);
                  // (create the matcher)
                  final TokenSequencePattern regex = TokenSequencePattern.compile(pattern);

                  // Run TokensRegex
                  return JSONOutputter.JSONWriter.objectToJSON(
                      (docWriter) -> {
                        if (filter) {
                          // Case: just filter sentences
                          docWriter.set(
                              "sentences",
                              doc.get(CoreAnnotations.SentencesAnnotation.class).stream()
                                  .map(
                                      sentence ->
                                          regex
                                              .matcher(
                                                  sentence.get(
                                                      CoreAnnotations.TokensAnnotation.class))
                                              .matches())
                                  .collect(Collectors.toList()));
                        } else {
                          // Case: find matches
                          docWriter.set(
                              "sentences",
                              doc.get(CoreAnnotations.SentencesAnnotation.class).stream()
                                  .map(
                                      sentence ->
                                          (Consumer<JSONOutputter.Writer>)
                                              (JSONOutputter.Writer sentWriter) ->
                                                  writeTokensRegexSentence(
                                                      sentWriter, regex, sentence))
                                  .collect(Collectors.toList()));
                        }
                      });
                } catch (Exception e) {
                  e.printStackTrace();
                  try {
                    respondError(e.getClass().getName() + ": " + e.getMessage(), httpExchange);
                  } catch (IOException ignored) {
                    // Best effort: the exchange may already have been answered (e.g. after the
                    // handler thread reported a timeout), so there is nobody left to notify.
                  }
                }
                // Empty result signals that an error response was already attempted.
                return "";
              });

      // Send response
      byte[] response = new byte[0];
      try {
        response = json.get(5, TimeUnit.SECONDS).getBytes(java.nio.charset.StandardCharsets.UTF_8);
      } catch (TimeoutException e) {
        // Stop (or never start) the abandoned query instead of letting it run to completion.
        json.cancel(true);
        respondError("Timeout when executing TokensRegex query", httpExchange);
      } catch (InterruptedException e) {
        json.cancel(true);
        // Preserve the interrupt for whoever owns this handler thread.
        Thread.currentThread().interrupt();
        respondError("Interrupted when executing TokensRegex query", httpExchange);
      } catch (ExecutionException e) {
        // Only reached for failures the task did not handle itself; report the real cause
        // rather than a misleading timeout.
        Throwable cause = (e.getCause() != null) ? e.getCause() : e;
        respondError(cause.getClass().getName() + ": " + cause.getMessage(), httpExchange);
      }
      if (response.length > 0) {
        try {
          httpExchange.getResponseHeaders().add("Content-Type", "text/json");
          httpExchange
              .getResponseHeaders()
              .add("Content-Length", Integer.toString(response.length));
          httpExchange.sendResponseHeaders(HTTP_OK, response.length);
          httpExchange.getResponseBody().write(response);
        } finally {
          // Release the exchange even if writing the body fails.
          httpExchange.close();
        }
      }
    }

    /**
     * Writes every match of {@code regex} within one sentence: each match under its zero-based
     * match index, followed by a {@code "length"} entry holding the number of matches.
     */
    private void writeTokensRegexSentence(
        JSONOutputter.Writer sentWriter, TokenSequencePattern regex, CoreMap sentence) {
      List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
      TokenSequenceMatcher matcher = regex.matcher(tokens);
      int matchCount = 0;
      while (matcher.find()) {
        // The consumer reads the matcher's current match, so it relies on the writer invoking
        // it before the next find() call (as the original inline lambda did).
        sentWriter.set(
            Integer.toString(matchCount),
            (Consumer<JSONOutputter.Writer>)
                (JSONOutputter.Writer matchWriter) ->
                    writeTokensRegexMatch(matchWriter, matcher));
        matchCount += 1;
      }
      sentWriter.set("length", matchCount);
    }

    /**
     * Writes the matcher's current match: its {@code "text"}, {@code "begin"} and {@code "end"},
     * then one entry per capture group keyed by the group's variable name, or by its one-based
     * group number when it has no name.
     */
    private void writeTokensRegexMatch(
        JSONOutputter.Writer matchWriter, TokenSequenceMatcher matcher) {
      matchWriter.set("text", matcher.group());
      matchWriter.set("begin", matcher.start());
      matchWriter.set("end", matcher.end());
      for (int group = 1; group <= matcher.groupCount(); ++group) {
        SequenceMatchResult.MatchedGroupInfo<CoreMap> info = matcher.groupInfo(group);
        if (info == null) {
          // NOTE(review): defensive -- presumably a group that took no part in the match has no
          // info; skipping it avoids failing the whole response. Confirm against the matcher API.
          continue;
        }
        matchWriter.set(
            info.varName == null ? Integer.toString(group) : info.varName,
            (Consumer<JSONOutputter.Writer>)
                (JSONOutputter.Writer groupWriter) -> writeTokensRegexGroup(groupWriter, info));
      }
    }

    /**
     * Writes one capture group: its {@code "text"} and, when it covers at least one token,
     * {@code "begin"} (first token's index annotation minus one) and {@code "end"} (last token's
     * index annotation).
     */
    private void writeTokensRegexGroup(
        JSONOutputter.Writer groupWriter, SequenceMatchResult.MatchedGroupInfo<CoreMap> info) {
      groupWriter.set("text", info.text);
      if (!info.nodes.isEmpty()) {
        groupWriter.set(
            "begin", info.nodes.get(0).get(CoreAnnotations.IndexAnnotation.class) - 1);
        groupWriter.set(
            "end",
            info.nodes.get(info.nodes.size() - 1).get(CoreAnnotations.IndexAnnotation.class));
      }
    }
  /**
   * Validates the query arguments, compiles the TokensRegex pattern and declares the output
   * attributes.
   *
   * <p>The first argument must be a constant expression that evaluates to a string; it is
   * compiled into {@code regexPattern}. The second argument must be a variable expression. After
   * validation the pipeline is initialised via {@code initPipeline()}.
   *
   * @param abstractDefinition stream definition; only its attribute list is read, for debug
   *     logging
   * @param attributeExpressionExecutors executors for the query arguments
   * @param executionPlanContext not used by this method
   * @return a {@code "match"} string attribute followed by one string attribute named
   *     {@code groupPrefix + i} for each {@code i} from 1 up to (but excluding) the pattern's
   *     total group count
   * @throws ExecutionPlanCreationException if fewer than two arguments were supplied, an argument
   *     has the wrong kind or type, or the regex cannot be compiled
   */
  @Override
  protected List<Attribute> init(
      AbstractDefinition abstractDefinition,
      ExpressionExecutor[] attributeExpressionExecutors,
      ExecutionPlanContext executionPlanContext) {
    // Shared tail of every argument-validation message.
    final String usage =
        ".\nUsage: #nlp.findTokensRegexPattern(regex:string, text:string-variable)";

    if (logger.isDebugEnabled()) {
      logger.debug("Initializing Query ...");
    }

    if (attributeExpressionLength < 2) {
      throw new ExecutionPlanCreationException(
          "Query expects at least two parameters. Received only "
              + attributeExpressionLength
              + usage);
    }

    // First argument: a constant string holding the regex.
    ExpressionExecutor regexExecutor = attributeExpressionExecutors[0];
    if (!(regexExecutor instanceof ConstantExpressionExecutor)) {
      throw new ExecutionPlanCreationException("First parameter should be a constant." + usage);
    }
    String regex;
    try {
      regex = (String) regexExecutor.execute(null);
    } catch (ClassCastException e) {
      throw new ExecutionPlanCreationException(
          "First parameter should be of type string. Found "
              + regexExecutor.getReturnType()
              + usage);
    }

    try {
      regexPattern = TokenSequencePattern.compile(regex);
    } catch (Exception e) {
      throw new ExecutionPlanCreationException("Cannot parse given regex " + regex, e);
    }

    // Second argument: the stream variable carrying the text.
    ExpressionExecutor textExecutor = attributeExpressionExecutors[1];
    if (!(textExecutor instanceof VariableExpressionExecutor)) {
      throw new ExecutionPlanCreationException("Second parameter should be a variable." + usage);
    }

    if (logger.isDebugEnabled()) {
      String summary =
          String.format(
              "Query parameters initialized. Regex: %s Stream Parameters: %s",
              regex, abstractDefinition.getAttributeList());
      logger.debug(summary);
    }

    initPipeline();

    // Output schema: the whole match first, then one attribute per capture group.
    List<Attribute> outputAttributes = new ArrayList<>(1);
    outputAttributes.add(new Attribute("match", Attribute.Type.STRING));
    attributeCount = regexPattern.getTotalGroups();
    for (int groupIndex = 1; groupIndex < attributeCount; groupIndex++) {
      outputAttributes.add(new Attribute(groupPrefix + groupIndex, Attribute.Type.STRING));
    }
    return outputAttributes;
  }