private MultiPatternMatcher<CoreMap> createPatternMatcher(
    Map<SequencePattern<CoreMap>, Entry> patternToEntry) {
  // Convert to tokensregex pattern
  int patternFlags = ignoreCase ? Pattern.CASE_INSENSITIVE : 0;
  int stringMatchFlags = ignoreCase ? NodePattern.CASE_INSENSITIVE : 0;
  Env env = TokenSequencePattern.getNewEnv();
  env.setDefaultStringPatternFlags(patternFlags);
  env.setDefaultStringMatchFlags(stringMatchFlags);
  NodePattern<String> posTagPattern =
      (validPosPattern != null && PosMatchType.MATCH_ALL_TOKENS.equals(posMatchType))
          ? new CoreMapNodePattern.StringAnnotationRegexPattern(validPosPattern)
          : null;
  List<TokenSequencePattern> patterns = new ArrayList<>(entries.size());
  for (Entry entry : entries) {
    TokenSequencePattern pattern;
    if (entry.tokensRegex != null) {
      // TODO: posTagPatterns...
      pattern = TokenSequencePattern.compile(env, entry.tokensRegex);
    } else {
      List<SequencePattern.PatternExpr> nodePatterns = new ArrayList<>();
      for (String p : entry.regex) {
        CoreMapNodePattern c = CoreMapNodePattern.valueOf(p, patternFlags);
        if (posTagPattern != null) {
          c.add(CoreAnnotations.PartOfSpeechAnnotation.class, posTagPattern);
        }
        nodePatterns.add(new SequencePattern.NodePatternExpr(c));
      }
      pattern =
          TokenSequencePattern.compile(new SequencePattern.SequencePatternExpr(nodePatterns));
    }
    if (entry.annotateGroup < 0 || entry.annotateGroup > pattern.getTotalGroups()) {
      throw new RuntimeException("Invalid match group for entry " + entry);
    }
    pattern.setPriority(entry.priority);
    patterns.add(pattern);
    patternToEntry.put(pattern, entry);
  }
  return TokenSequencePattern.getMultiPatternMatcher(patterns);
}
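// A minimal standalone sketch of the technique createPatternMatcher() packages up:
// compile several TokenSequencePatterns and run them over one token list in a single
// pass via a MultiPatternMatcher. The pattern strings and the "tokenize,ssplit,pos"
// pipeline below are illustrative assumptions, not values taken from the method above.
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.tokensregex.MultiPatternMatcher;
import edu.stanford.nlp.ling.tokensregex.SequenceMatchResult;
import edu.stanford.nlp.ling.tokensregex.TokenSequencePattern;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;
import java.util.List;
import java.util.Properties;

public class MultiPatternDemo {
  public static void main(String[] args) {
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize,ssplit,pos");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

    Annotation doc = new Annotation("Barack Obama was born in Hawaii.");
    pipeline.annotate(doc);
    List<CoreLabel> tokens = doc.get(CoreAnnotations.TokensAnnotation.class);

    // Compile several patterns, then find all non-overlapping matches in one pass
    TokenSequencePattern p1 = TokenSequencePattern.compile("[{tag:/NNP.*/}]+");
    TokenSequencePattern p2 = TokenSequencePattern.compile("/was/ /born/");
    MultiPatternMatcher<CoreMap> matcher =
        TokenSequencePattern.getMultiPatternMatcher(p1, p2);

    for (SequenceMatchResult<CoreMap> match : matcher.findNonOverlapping(tokens)) {
      System.out.println(match.group());
    }
  }
}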
public WordsToSentencesAnnotator(
    boolean verbose,
    String boundaryTokenRegex,
    Set<String> boundaryToDiscard,
    Set<String> htmlElementsToDiscard,
    String newlineIsSentenceBreak,
    String boundaryMultiTokenRegex,
    Set<String> tokenRegexesToDiscard) {
  this(
      verbose,
      false,
      new WordToSentenceProcessor<CoreLabel>(
          boundaryTokenRegex,
          boundaryToDiscard,
          htmlElementsToDiscard,
          WordToSentenceProcessor.stringToNewlineIsSentenceBreak(newlineIsSentenceBreak),
          (boundaryMultiTokenRegex != null)
              ? TokenSequencePattern.compile(boundaryMultiTokenRegex)
              : null,
          tokenRegexesToDiscard));
}
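// A minimal sketch of calling this convenience constructor directly. The boundary
// regex and the "two" newline policy (break on a blank line) are illustrative
// defaults, and Collections.emptySet() stands in for options a caller may not need;
// check stringToNewlineIsSentenceBreak() for the exact strings your version accepts.
import edu.stanford.nlp.pipeline.WordsToSentencesAnnotator;
import java.util.Collections;
import java.util.Set;

public class SsplitConfigDemo {
  public static void main(String[] args) {
    Set<String> boundariesToDiscard = Collections.singleton("\n");
    WordsToSentencesAnnotator ssplit =
        new WordsToSentencesAnnotator(
            false,                   // verbose
            "\\.|[!?]+",             // boundaryTokenRegex: sentence ends on ., !, ?
            boundariesToDiscard,     // boundary tokens dropped from output
            Collections.emptySet(),  // htmlElementsToDiscard
            "two",                   // newlineIsSentenceBreak: blank line = break
            null,                    // boundaryMultiTokenRegex: none, so no compile
            Collections.emptySet()); // tokenRegexesToDiscard
    System.out.println("ssplit annotator configured: " + ssplit);
  }
}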
@Override
public void handle(HttpExchange httpExchange) throws IOException {
  // Set common response headers
  httpExchange.getResponseHeaders().add("Access-Control-Allow-Origin", "*");

  Future<String> json = corenlpExecutor.submit(() -> {
    try {
      // Get the document
      Properties props = new Properties();
      props.setProperty("annotators", "tokenize,ssplit,pos,lemma,ner");
      Annotation doc = getDocument(props, httpExchange);
      if (!doc.containsKey(CoreAnnotations.SentencesAnnotation.class)) {
        StanfordCoreNLP pipeline = mkStanfordCoreNLP(props);
        pipeline.annotate(doc);
      }

      // Construct the matcher
      Map<String, String> params = getURLParams(httpExchange.getRequestURI());
      // (get the pattern)
      if (!params.containsKey("pattern")) {
        respondError("Missing required parameter 'pattern'", httpExchange);
        return "";
      }
      String pattern = params.get("pattern");
      // (get whether to filter / find)
      String filterStr = params.getOrDefault("filter", "false");
      final boolean filter =
          filterStr.trim().isEmpty() || "true".equalsIgnoreCase(filterStr);
      // (create the matcher)
      final TokenSequencePattern regex = TokenSequencePattern.compile(pattern);

      // Run TokensRegex
      return JSONOutputter.JSONWriter.objectToJSON((docWriter) -> {
        if (filter) {
          // Case: just report, per sentence, whether the whole sentence matches
          docWriter.set("sentences",
              doc.get(CoreAnnotations.SentencesAnnotation.class).stream()
                  .map(sentence -> regex
                      .matcher(sentence.get(CoreAnnotations.TokensAnnotation.class))
                      .matches())
                  .collect(Collectors.toList()));
        } else {
          // Case: find matches within each sentence
          docWriter.set("sentences",
              doc.get(CoreAnnotations.SentencesAnnotation.class).stream()
                  .map(sentence -> (Consumer<JSONOutputter.Writer>)
                      (JSONOutputter.Writer sentWriter) -> {
                    List<CoreLabel> tokens =
                        sentence.get(CoreAnnotations.TokensAnnotation.class);
                    TokenSequenceMatcher matcher = regex.matcher(tokens);
                    int i = 0;
                    while (matcher.find()) {
                      sentWriter.set(Integer.toString(i),
                          (Consumer<JSONOutputter.Writer>)
                              (JSONOutputter.Writer matchWriter) -> {
                        matchWriter.set("text", matcher.group());
                        matchWriter.set("begin", matcher.start());
                        matchWriter.set("end", matcher.end());
                        for (int groupI = 0; groupI < matcher.groupCount(); ++groupI) {
                          SequenceMatchResult.MatchedGroupInfo<CoreMap> info =
                              matcher.groupInfo(groupI + 1);
                          // Key the group by its variable name if it has one,
                          // otherwise by its 1-based group number
                          matchWriter.set(
                              info.varName == null
                                  ? Integer.toString(groupI + 1)
                                  : info.varName,
                              (Consumer<JSONOutputter.Writer>) groupWriter -> {
                                groupWriter.set("text", info.text);
                                if (info.nodes.size() > 0) {
                                  // IndexAnnotation is 1-based; report a 0-based begin
                                  groupWriter.set("begin",
                                      info.nodes.get(0)
                                          .get(CoreAnnotations.IndexAnnotation.class) - 1);
                                  groupWriter.set("end",
                                      info.nodes.get(info.nodes.size() - 1)
                                          .get(CoreAnnotations.IndexAnnotation.class));
                                }
                              });
                        }
                      });
                      i += 1;
                    }
                    sentWriter.set("length", i);
                  }));
        }
      });
    } catch (Exception e) {
      e.printStackTrace();
      try {
        respondError(e.getClass().getName() + ": " + e.getMessage(), httpExchange);
      } catch (IOException ignored) {
      }
    }
    return "";
  });

  // Send response
  byte[] response = new byte[0];
  try {
    response = json.get(5, TimeUnit.SECONDS).getBytes();
  } catch (InterruptedException | ExecutionException | TimeoutException e) {
    respondError("Timeout when executing TokensRegex query", httpExchange);
  }
  if (response.length > 0) {
    httpExchange.getResponseHeaders().add("Content-Type", "text/json");
    httpExchange.getResponseHeaders().add("Content-Length", Integer.toString(response.length));
    httpExchange.sendResponseHeaders(HTTP_OK, response.length);
    httpExchange.getResponseBody().write(response);
    httpExchange.close();
  }
}
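// A minimal client-side sketch for exercising this handler: the text goes in the
// POST body and the TokensRegex pattern in the "pattern" query parameter. The
// /tokensregex path and localhost:9000 address are assumptions about how the
// handler is mounted (they match stock StanfordCoreNLPServer defaults, but verify
// your deployment).
import java.net.URI;
import java.net.URLEncoder;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.nio.charset.StandardCharsets;

public class TokensRegexClientDemo {
  public static void main(String[] args) throws Exception {
    String pattern =
        URLEncoder.encode("([{ner:\"PERSON\"}]+) /is/", StandardCharsets.UTF_8);
    HttpRequest request = HttpRequest.newBuilder()
        .uri(URI.create("http://localhost:9000/tokensregex?pattern=" + pattern))
        .POST(HttpRequest.BodyPublishers.ofString("Chris Manning is a professor."))
        .build();
    HttpResponse<String> response =
        HttpClient.newHttpClient().send(request, HttpResponse.BodyHandlers.ofString());
    System.out.println(response.body()); // JSON with per-sentence matches and groups
  }
}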
@Override
protected List<Attribute> init(
    AbstractDefinition abstractDefinition,
    ExpressionExecutor[] attributeExpressionExecutors,
    ExecutionPlanContext executionPlanContext) {
  if (logger.isDebugEnabled()) {
    logger.debug("Initializing Query ...");
  }

  if (attributeExpressionLength < 2) {
    throw new ExecutionPlanCreationException(
        "Query expects at least two parameters. Received only " + attributeExpressionLength
            + ".\nUsage: #nlp.findTokensRegexPattern(regex:string, text:string-variable)");
  }

  String regex;
  try {
    if (attributeExpressionExecutors[0] instanceof ConstantExpressionExecutor) {
      regex = (String) attributeExpressionExecutors[0].execute(null);
    } else {
      throw new ExecutionPlanCreationException(
          "First parameter should be a constant."
              + "\nUsage: #nlp.findTokensRegexPattern(regex:string, text:string-variable)");
    }
  } catch (ClassCastException e) {
    throw new ExecutionPlanCreationException(
        "First parameter should be of type string. Found "
            + attributeExpressionExecutors[0].getReturnType()
            + ".\nUsage: #nlp.findTokensRegexPattern(regex:string, text:string-variable)");
  }

  try {
    regexPattern = TokenSequencePattern.compile(regex);
  } catch (Exception e) {
    throw new ExecutionPlanCreationException("Cannot parse given regex " + regex, e);
  }

  if (!(attributeExpressionExecutors[1] instanceof VariableExpressionExecutor)) {
    throw new ExecutionPlanCreationException(
        "Second parameter should be a variable."
            + "\nUsage: #nlp.findTokensRegexPattern(regex:string, text:string-variable)");
  }

  if (logger.isDebugEnabled()) {
    logger.debug(String.format(
        "Query parameters initialized. Regex: %s Stream Parameters: %s",
        regex, abstractDefinition.getAttributeList()));
  }

  initPipeline();

  // One output attribute for the full match, plus one per capture group
  ArrayList<Attribute> attributes = new ArrayList<>(1);
  attributes.add(new Attribute("match", Attribute.Type.STRING));
  attributeCount = regexPattern.getTotalGroups();
  for (int i = 1; i < attributeCount; i++) {
    attributes.add(new Attribute(groupPrefix + i, Attribute.Type.STRING));
  }
  return attributes;
}
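// A minimal sketch of the attribute contract this init() establishes: for a regex
// with capture groups, each event carries "match" plus one attribute per group.
// The "group_" prefix shown in the comments and the two-group pattern are
// illustrative assumptions (groupPrefix is a field of the class above), and the
// "tokenize,ssplit,pos,lemma,ner" pipeline is what initPipeline() is presumed to build.
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.tokensregex.TokenSequenceMatcher;
import edu.stanford.nlp.ling.tokensregex.TokenSequencePattern;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import java.util.List;
import java.util.Properties;

public class GroupAttributeDemo {
  public static void main(String[] args) {
    Properties props = new Properties();
    props.setProperty("annotators", "tokenize,ssplit,pos,lemma,ner");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

    Annotation doc = new Annotation("Bill Gates founded Microsoft.");
    pipeline.annotate(doc);
    List<CoreLabel> tokens = doc.get(CoreAnnotations.TokensAnnotation.class);

    // Two capture groups -> output attributes: match, group_1, group_2
    TokenSequencePattern pattern = TokenSequencePattern.compile(
        "([{ner:\"PERSON\"}]+) /founded/ ([{ner:\"ORGANIZATION\"}]+)");
    TokenSequenceMatcher matcher = pattern.matcher(tokens);
    while (matcher.find()) {
      System.out.println("match   = " + matcher.group());  // full match
      System.out.println("group_1 = " + matcher.group(1)); // PERSON span
      System.out.println("group_2 = " + matcher.group(2)); // ORGANIZATION span
    }
  }
}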