/**
 * Tokenizes a given JCas object's document text using the TreeTagger program and adds the
 * recognized tokens to the JCas object.
 *
 * @param jcas JCas object supplied by the pipeline
 */
private void tokenize(JCas jcas) {
    // read tokenized text to add tokens to the jcas
    Logger.printDetail(component, "TreeTagger (tokenization) with: " + ttprops.abbFileName);
    EnumSet<Flag> flags = Flag.getSet(ttprops.languageSwitch);

    // use the language-specific abbreviation file if one is configured
    TreeTaggerTokenizer ttt;
    if (ttprops.abbFileName != null) {
        ttt = new TreeTaggerTokenizer(ttprops.rootPath + ttprops.fileSeparator + "lib"
                + ttprops.fileSeparator + ttprops.abbFileName, flags);
    } else {
        ttt = new TreeTaggerTokenizer(null, flags);
    }

    // mark empty lines (paragraph breaks) so they survive tokenization
    String docText = jcas.getDocumentText().replaceAll("\n\n", "\nEMPTYLINE\n");
    List<String> tokenized = ttt.tokenize(docText);

    int tokenOffset = 0;
    // loop through all the lines in the treetagger output
    for (String s : tokenized) {
        // charset mismatch: signal the (invalid) token s that cannot be located
        if (!s.equals("EMPTYLINE") && jcas.getDocumentText().indexOf(s, tokenOffset) < 0)
            throw new RuntimeException("Could not find token " + s
                    + " in JCas after tokenizing with TreeTagger."
                    + " There may be a charset mismatch!"
                    + " The default encoding is " + Charset.defaultCharset().name()
                    + " and should always be UTF-8 (use -Dfile.encoding=UTF-8)."
                    + " If the input document is not UTF-8, additionally use the -e option to set the input encoding.");

        // create tokens and add them to the jcas's indexes.
        Token newToken = new Token(jcas);
        if (s.equals("EMPTYLINE")) {
            // zero-length placeholder token marking an empty line (paragraph break)
            newToken.setBegin(tokenOffset);
            newToken.setEnd(tokenOffset);
            newToken.setPos("EMPTYLINE");
            if (annotate_partofspeech) {
                newToken.addToIndexes();
            }
        } else {
            newToken.setBegin(jcas.getDocumentText().indexOf(s, tokenOffset));
            newToken.setEnd(newToken.getBegin() + s.length());
            newToken.addToIndexes();
            tokenOffset = newToken.getEnd();
        }
    }
}
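// The offset-tracked indexOf lookup above is the core alignment step: each string
// emitted by the tokenizer is located in the original document text at or after the
// end of the previously matched token, which keeps annotation offsets correct even
// when the tokenizer normalizes whitespace. A minimal standalone sketch of that
// idea (hypothetical helper, not part of the original class):
private static int[] alignToken(String docText, String token, int fromOffset) {
    // search only from the end of the previous token onwards
    int begin = docText.indexOf(token, fromOffset);
    if (begin < 0) {
        // mirrors the charset-mismatch failure signaled in tokenize(JCas)
        throw new RuntimeException("Token \"" + token + "\" not found at or after offset " + fromOffset);
    }
    return new int[] { begin, begin + token.length() };
}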
/**
 * Tokenizes a given JCas object's document text using the Chinese tokenization script and adds
 * the recognized tokens to the JCas object.
 *
 * @param jcas JCas object supplied by the pipeline
 */
private void tokenizeChinese(JCas jcas) {
    try {
        // read tokenized text to add tokens to the jcas
        Process proc = ttprops.getChineseTokenizationProcess();
        Logger.printDetail(component, "Chinese tokenization: " + ttprops.chineseTokenizerPath);
        BufferedReader in = new BufferedReader(new InputStreamReader(proc.getInputStream(), "UTF-8"));
        BufferedWriter out = new BufferedWriter(new OutputStreamWriter(proc.getOutputStream(), "UTF-8"));

        int tokenOffset = 0;
        // feed the document to the tokenizer line by line and read back its output
        String[] inSplits = jcas.getDocumentText().split("[\\r\\n]+");
        for (String inSplit : inSplits) {
            out.write(inSplit);
            out.newLine();
            out.flush();

            // do one initial read, then keep reading while output is immediately available
            String s = in.readLine();
            while (s != null) {
                String[] outSplits = s.split("\\s+");
                for (String tok : outSplits) {
                    int begin = jcas.getDocumentText().indexOf(tok, tokenOffset);
                    if (begin < 0)
                        throw new RuntimeException("Could not find token " + tok
                                + " in JCas after tokenizing with the Chinese tokenization script.");

                    // create tokens and add them to the jcas's indexes.
                    Token newToken = new Token(jcas);
                    newToken.setBegin(begin);
                    newToken.setEnd(begin + tok.length());
                    newToken.addToIndexes();
                    tokenOffset = newToken.getEnd();
                }

                // break out of the loop if the next read would block
                if (!in.ready())
                    break;
                s = in.readLine();
            }
        }

        // clean up
        out.close();
        in.close();
        proc.destroy();
    } catch (Exception e) {
        e.printStackTrace();
    }
}
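// Design note: the write-flush-read handshake above keeps the external tokenizer
// process from deadlocking on its buffered pipes; every input line is flushed before
// reading, and in.ready() is polled so readLine() never blocks on output that will
// only arrive after more input is written. A hedged sketch of the same per-line
// round trip against an arbitrary line-oriented child process (the helper name is
// an assumption, not part of the original class):
private static String roundTripLine(BufferedWriter toProc, BufferedReader fromProc, String line)
        throws IOException {
    toProc.write(line);
    toProc.newLine();
    toProc.flush();               // flush so the child process actually sees the line
    return fromProc.readLine();   // blocks until the child answers (or returns null on EOF)
}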