// carry out the actual annotations on the given span of text in the // document. protected void annotateText(Document doc, AnnotationSet outputAS, long from, long to) { String text = ""; try { text = doc.getContent().getContent(from, to).toString(); } catch (InvalidOffsetException ex) { throw new GateRuntimeException("Unexpected offset exception, offsets are " + from + "/" + to); } // send the text to the service and get back the response // System.out.println("Annotating text: "+text); // System.out.println("Starting offset is "+from); // NOTE: there is a bug in the TagMe service which causes offset errors // if we use the tweet mode and there are certain patterns in the tweet. // The approach recommended by Francesco Piccinno is to replace those // patterns by spaces. if (getIsTweet()) { logger.debug("Text before cleaning: >>" + text + "<<"); // replace text = text.replaceAll(patternStringRT3, " "); text = text.replaceAll(patternStringRT2, " "); text = text.replaceAll(patternHashTag, " $1"); // now replace the remaining patterns by spaces StringBuilder sb = new StringBuilder(text); Matcher m = patternUrl.matcher(text); while (m.find()) { int start = m.start(); int end = m.end(); sb.replace(start, end, nSpaces(end - start)); } m = patternUser.matcher(text); while (m.find()) { int start = m.start(); int end = m.end(); sb.replace(start, end, nSpaces(end - start)); } text = sb.toString(); logger.debug("Text after cleaning: >>" + text + "<<"); } TagMeAnnotation[] tagmeAnnotations = getTagMeAnnotations(text); for (TagMeAnnotation tagmeAnn : tagmeAnnotations) { if (tagmeAnn.rho >= minrho) { FeatureMap fm = Factory.newFeatureMap(); fm.put("tagMeId", tagmeAnn.id); fm.put("title", tagmeAnn.title); fm.put("rho", tagmeAnn.rho); fm.put("spot", tagmeAnn.spot); fm.put("link_probability", tagmeAnn.link_probability); if (tagmeAnn.title == null) { throw new GateRuntimeException("Odd: got a null title from the TagMe service" + tagmeAnn); } else { fm.put("inst", "http://dbpedia.org/resource/" + recodeForDbp38(tagmeAnn.title)); } try { gate.Utils.addAnn( outputAS, from + tagmeAnn.start, from + tagmeAnn.end, getOutputAnnotationType(), fm); } catch (Exception ex) { System.err.println( "Got an exception in document " + doc.getName() + ": " + ex.getLocalizedMessage()); ex.printStackTrace(System.err); System.err.println("from=" + from + ", to=" + to + " TagMeAnn=" + tagmeAnn); } } } }