// look up and apply coarts for w to each sign in result
@SuppressWarnings("unchecked")
private void applyCoarts(Word w, SignHash result) throws LexException {
  List<Sign> inputSigns = new ArrayList<Sign>(result.asSignSet());
  result.clear();
  List<Sign> outputSigns = new ArrayList<Sign>(inputSigns.size());
  // for each surface attr, look up coarts and apply to input signs,
  // storing results in output signs
  for (Iterator<Pair<String, String>> it = w.getSurfaceAttrValPairs(); it.hasNext(); ) {
    Pair<String, String> p = it.next();
    String attr = p.a;
    if (!_indexedCoartAttrs.contains(attr)) continue;
    String val = p.b;
    Word coartWord = Word.createWord(attr, val);
    SignHash coartResult = getSignsFromWord(coartWord, null, null, null);
    for (Iterator<Sign> it2 = coartResult.iterator(); it2.hasNext(); ) {
      Sign coartSign = it2.next();
      // apply to each input sign
      for (int j = 0; j < inputSigns.size(); j++) {
        Sign sign = inputSigns.get(j);
        grammar.rules.applyCoart(sign, coartSign, outputSigns);
      }
    }
    // switch output to input for the next iteration
    inputSigns.clear();
    inputSigns.addAll(outputSigns);
    outputSigns.clear();
  }
  // add results back
  result.addAll(inputSigns);
}
// get signs with additional args for a known special token const, target pred and target rel
private SignHash getSignsFromWord(
    Word w, String specialTokenConst, String targetPred, String targetRel) throws LexException {
  Collection<MorphItem> morphItems =
      (specialTokenConst == null) ? (Collection<MorphItem>) _words.get(w) : null;
  if (morphItems == null) {
    // check for special tokens
    if (specialTokenConst == null) {
      specialTokenConst =
          tokenizer.getSpecialTokenConstant(tokenizer.isSpecialToken(w.getForm()));
      targetPred = w.getForm();
    }
    if (specialTokenConst != null) {
      Word key = Word.createSurfaceWord(w, specialTokenConst);
      morphItems = (Collection<MorphItem>) _words.get(key);
    }
    // otherwise throw a lex exception
    if (morphItems == null) throw new LexException(w + " not in lexicon");
  }
  SignHash result = new SignHash();
  for (Iterator<MorphItem> it = morphItems.iterator(); it.hasNext(); ) {
    getWithMorphItem(w, it.next(), targetPred, targetRel, result);
  }
  return result;
}
/**
 * overhaulSentence is the main method of this class: it scans the sentence and, for each word
 * whose part of speech is one we are swapping, attempts to replace it with a synonym.
 *
 * @param sentence the sentence to rewrite
 * @param partsSearching the parts of speech whose words should be swapped
 * @return the sentence with replacement values set on swapped words
 */
public ArrayList<Word> overhaulSentence(
    ArrayList<Word> sentence, ArrayList<String> partsSearching) {
  int index = 0;
  String k;
  // iterate through the sentence, check words, set newVals to synonyms
  for (Word w : sentence) {
    for (String part : partsSearching) {
      if (w.getPOS().equals(part)) {
        k = findSynonym(sentence, index);
        // record the candidate synonym, then discard it unless it lengthens the word;
        // the original left k unused, so applying it before the length check is assumed here
        w.setNewValue(k);
        if (w.getNewLength() <= w.getOrigLength()) w.setNewValue(null);
      }
    }
    index++;
  }
  return sentence;
}
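// Illustrative call (hypothetical data; not from the original source). Only getPOS(),
// setNewValue(), getNewLength() and getOrigLength() are relied on by the method above:
//
//   ArrayList<String> targets = new ArrayList<String>(Arrays.asList("NN", "VB"));
//   sentence = overhauler.overhaulSentence(sentence, targets);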
/**
 * For a given word, return all of its surface word's lexical entries. If the word is not listed
 * in the lexicon, the tokenizer is consulted to see if it is a special token (date, time, etc.);
 * otherwise an exception is thrown. If the word has coarticulations, all applicable
 * coarticulation entries are applied to the base word, in an arbitrary order.
 *
 * @param w the word
 * @return a sign hash
 * @exception LexException thrown if the word is not found
 */
public SignHash getSignsFromWord(Word w) throws LexException {
  // reduce the word to its core, removing coart attrs if any
  Word surfaceWord = Word.createSurfaceWord(w);
  Word coreWord =
      (surfaceWord.attrsIntersect(_coartAttrs))
          ? Word.createCoreSurfaceWord(surfaceWord, _coartAttrs)
          : surfaceWord;
  // look up the core word
  SignHash result = getSignsFromWord(coreWord, null, null, null);
  if (result.size() == 0) {
    throw new LexException(coreWord + " not found in lexicon");
  }
  // return the signs directly if there are no coart attrs
  if (coreWord == surfaceWord) return result;
  // otherwise apply coarts for the word
  applyCoarts(surfaceWord, result);
  return result;
}
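// A minimal usage sketch (not part of the original source): look up all signs for a word via
// the method above and handle the unknown-word case. `Lexicon` as the enclosing class name is
// an assumption here.
static void printSigns(Lexicon lexicon, Word word) {
  try {
    SignHash signs = lexicon.getSignsFromWord(word);
    for (Sign sign : signs.asSignSet()) {
      System.out.println(sign); // one line per applicable lexical sign
    }
  } catch (LexException e) {
    // raised for words that are neither in the lexicon nor recognizable special tokens
    System.err.println("not in lexicon: " + e.getMessage());
  }
}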
public Word copy() {
  Word w = new Word();
  w.text = text;
  w.wordnum = wordnum;
  w.word = word;
  w.pattern = pattern;
  // w.score = score;
  w.enabled = enabled;
  w.candidates = new Candidate[candidates.length];
  w.numUniqueLetters = numUniqueLetters;
  w.uniqueletters = uniqueletters;
  // shallow copy: a new array, but the same Candidate references
  for (int i = 0; i < candidates.length; i++) w.candidates[i] = candidates[i];
  return w;
}
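// A design note (sketch, not a change to the method above): the element-wise loop performs a
// shallow array copy, which clone() expresses more compactly with the identical effect.
static Candidate[] shallowCopy(Candidate[] candidates) {
  return candidates.clone(); // new array, same Candidate references
}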
// get signs using an additional arg for a target rel
private Collection<Sign> getSignsFromPredAndTargetRel(String pred, String targetRel) {

  Collection<Word> words = (Collection<Word>) _predToWords.get(pred);
  String specialTokenConst = null;

  // for robustness, when using the supertagger, add words for the pred sans sense index
  int dotIndex = -1;
  if (_supertagger != null
      && !Character.isDigit(pred.charAt(0)) // skip numbers
      && (dotIndex = pred.lastIndexOf('.')) > 0
      && pred.length() > dotIndex + 1
      && pred.charAt(dotIndex + 1) != '_') // skip titles, eg Mr._Smith
  {
    String barePred = pred.substring(0, dotIndex);
    Collection<Word> barePredWords = (Collection<Word>) _predToWords.get(barePred);
    if (words == null) words = barePredWords;
    else if (barePredWords != null) {
      Set<Word> unionWords = new HashSet<Word>(words);
      unionWords.addAll(barePredWords);
      words = unionWords;
    }
  }

  if (words == null) {
    specialTokenConst = tokenizer.getSpecialTokenConstant(tokenizer.isSpecialToken(pred));
    if (specialTokenConst == null) return null;
    // look up words with pred = special token const
    Collection<Word> specialTokenWords = (Collection<Word>) _predToWords.get(specialTokenConst);
    if (specialTokenWords == null) return null;
    // replace the special token const with the pred
    words = new ArrayList<Word>(specialTokenWords.size());
    for (Iterator<Word> it = specialTokenWords.iterator(); it.hasNext(); ) {
      Word stw = it.next();
      words.add(Word.createSurfaceWord(stw, pred));
    }
  }

  List<Sign> retval = new ArrayList<Sign>();
  for (Iterator<Word> it = words.iterator(); it.hasNext(); ) {
    Word w = it.next();
    try {
      SignHash signs = getSignsFromWord(w, specialTokenConst, pred, targetRel);
      retval.addAll(signs.asSignSet());
    } catch (LexException exc) {
      // shouldn't happen
      System.err.println("Unexpected lex exception for word " + w + ": " + exc);
    }
  }
  return retval;
}
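// A standalone sketch (assumed helper name, not in the original) of the sense-index stripping
// applied above: "bake.01" yields "bake", while bare numbers and titles such as "Mr._Smith"
// pass through unchanged.
static String barePred(String pred) {
  int dotIndex = pred.lastIndexOf('.');
  if (!Character.isDigit(pred.charAt(0)) // skip numbers
      && dotIndex > 0
      && pred.length() > dotIndex + 1
      && pred.charAt(dotIndex + 1) != '_') { // skip titles, eg Mr._Smith
    return pred.substring(0, dotIndex);
  }
  return pred;
}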
public PalindromeTest() {
  ArrayList<String> words = new ArrayList<String>();
  ArrayList<String> palis;
  try {
    for (String line : Files.readAllLines(Paths.get("com/jsanders/web2"))) {
      words.add(line);
    }
  } catch (IOException ex) {
    System.out.println("IO error");
    System.exit(1);
  }

  palis = Palindrome.palindromes(words);
  assertEquals(palis.size(), 161, 0.01);

  int shortest = Word.shortestLength(words);
  assertEquals(shortest, 1, 0.01);

  int longest = Word.longestLength(words);
  assertEquals(longest, 24, 0.01);

  ArrayList<String> shortestWords = Word.shortestWords(words);
  assertEquals(shortestWords.size(), 52, 0.01);

  ArrayList<String> longestWords = Word.longestWords(words);
  assertEquals(longestWords.size(), 5, 0.01);

  int totalWords = Word.totalWords(words);
  double avgLen = Word.averageLength(words);
  assertEquals(totalWords, 235886, 0.01);
  assertEquals(avgLen, 9.56, 0.01);

  ArrayList<Double> letterFreq = Word.letterFrequency(words);
  assertEquals(letterFreq.get(0), 0.087, 0.01);

  double properFreq = Word.properFrequency(words);
  assertEquals(properFreq, 0.106, 0.01);

  ArrayList<Integer> startFreq = Word.startFrequency(words);
  assertEquals(startFreq.get(0), 17096, 0.01);

  ArrayList<String> sameStartEnd = Word.startEndWords(words);
  assertEquals(sameStartEnd.size(), 11505, 0.01);

  try {
    PrintWriter f = new PrintWriter("short.txt");
    for (String w : shortestWords) f.println(w);
    f.close();

    f = new PrintWriter("long.txt");
    for (String w : longestWords) f.println(w);
    f.close();

    f = new PrintWriter("same.txt");
    for (String w : sameStartEnd) f.println(w);
    f.close();

    f = new PrintWriter("statistics.txt");
    f.println("avg word len: " + avgLen);
    f.println("freq of letters: " + letterFreq);
    f.println("freq of proper nouns/names: " + properFreq);
    f.println("words that start with each letter: " + startFreq);
    f.close();
  } catch (IOException ex) {
    System.out.println("IO error");
    System.exit(1);
  }
}
// given EntriesItem
private void getWithEntriesItem(
    Word w,
    MorphItem mi,
    String stem,
    String pred,
    String targetPred,
    String targetRel,
    EntriesItem item,
    MacroAdder macAdder,
    Map<String, Double> supertags,
    Set<String> supertagsFound,
    SignHash result) {
  // ensure apropos
  if (targetPred != null && !targetPred.equals(pred)) return;
  if (targetRel != null
      && !targetRel.equals(item.getIndexRel())
      && !targetRel.equals(item.getCoartRel())) return;
  if (!item.getActive().booleanValue()) return;
  if (mi.excluded(item)) return;

  try {
    // copy the category and add macros
    Category cat = item.getCat().copy();
    macAdder.addMacros(cat);

    // replace DEFAULT_VAL with pred, after first
    // unifying the type of the associated nom var(s) with the sem class
    unifySemClass(cat, mi.getWord().getSemClass());
    REPLACEMENT = pred;
    cat.deepMap(defaultReplacer);

    // check supertag
    // TODO: think about earlier checks for efficiency, for grammars where macros and preds
    //       don't matter
    // Double lexprob = null; // nb: skipping lex log probs, don't seem to be helpful
    if (supertags != null) {
      // skip if not found
      String stag = cat.getSupertag();
      if (!supertags.containsKey(stag)) return;
      // otherwise update found supertags
      supertagsFound.add(stag);
      // get lex prob
      // lexprob = supertags.get(stag);
    }

    // propagate types of nom vars
    propagateTypes(cat);

    // handle distributive attrs and inherits-from
    propagateDistributiveAttrs(cat);
    expandInheritsFrom(cat);

    // merge stem, pos, sem class from the morph item, plus the supertag from the category
    Word word = Word.createFullWord(w, mi.getWord(), cat.getSupertag());

    // set origin and lexprob
    Sign sign = new Sign(word, cat);
    sign.setOrigin();
    // if (lexprob != null) {
    //   sign.addData(new SupertaggerAdapter.LexLogProb((float) Math.log10(lexprob)));
    // }
    // return sign
    result.insert(sign);
  } catch (RuntimeException exc) {
    System.err.println(
        "Warning: ignoring entry: " + item.getName() + " of family: " + item.getFamilyName()
            + " for stem: " + stem + " b/c: " + exc.toString());
  }
}
/** Loads the lexicon and morph files. */
public void init(URL lexiconUrl, URL morphUrl) throws IOException {

  List<Family> lexicon = null;
  List<MorphItem> morph = null;
  List<MacroItem> macroModel = null;

  // load category families (lexicon), morph forms and macros
  lexicon = getLexicon(lexiconUrl);
  Pair<List<MorphItem>, List<MacroItem>> morphInfo = getMorph(morphUrl);
  morph = morphInfo.a;
  macroModel = morphInfo.b;

  // index words; also index stems to words, as default preds;
  // store indexed coarticulation attrs too
  _words = new GroupMap<Word, MorphItem>();
  _predToWords = new GroupMap<String, Word>();
  _coartAttrs = new HashSet<String>();
  _indexedCoartAttrs = new HashSet<String>();
  for (MorphItem morphItem : morph) {
    Word surfaceWord = morphItem.getSurfaceWord();
    _words.put(surfaceWord, morphItem);
    _predToWords.put(morphItem.getWord().getStem(), surfaceWord);
    if (morphItem.isCoart()) {
      Word indexingWord = morphItem.getCoartIndexingWord();
      _words.put(indexingWord, morphItem);
      Pair<String, String> first = indexingWord.getSurfaceAttrValPairs().next();
      _indexedCoartAttrs.add(first.a);
      for (Iterator<Pair<String, String>> it = surfaceWord.getSurfaceAttrValPairs();
          it.hasNext(); ) {
        Pair<String, String> p = it.next();
        _coartAttrs.add(p.a);
      }
    }
  }

  // index entries based on stem+pos
  _stems = new GroupMap<String, Object>();
  _posToEntries = new GroupMap<String, EntriesItem[]>();
  // index entries by supertag+pos, for supertagging
  _stagToEntries = new GroupMap<String, EntriesItem>();
  // also index rels and coart rels to preds
  _relsToPreds = new GroupMap<String, String>();
  _coartRelsToPreds = new GroupMap<String, String>();
  // and gather the list of attributes used per atomic category type
  _catsToAttrs = new GroupMap<String, String>();
  _lfAttrs = new HashSet<String>();
  // and remember family and entry names, for checking the excluded list on morph items
  HashSet<String> familyAndEntryNames = new HashSet<String>();

  // index each family
  for (Family family : lexicon) {

    familyAndEntryNames.add(family.getName());
    EntriesItem[] entries = family.getEntries();
    DataItem[] data = family.getData();

    // for generic use when we get an unknown stem
    // from the morphological analyzer
    if (!family.isClosed()) {
      _posToEntries.put(family.getPOS(), entries);
    }

    // scan through entries
    for (int j = 0; j < entries.length; j++) {
      // index
      EntriesItem eItem = entries[j];
      _stagToEntries.put(eItem.getSupertag() + family.getPOS(), eItem);
      if (eItem.getStem().length() > 0) {
        _stems.put(eItem.getStem() + family.getPOS(), eItem);
      }
      try {
        // gather features
        eItem.getCat().forall(gatherAttrs);
        // record names
        familyAndEntryNames.add(eItem.getName());
        familyAndEntryNames.add(eItem.getQualifiedName());
      } catch (RuntimeException exc) {
        System.err.println("exception for: " + family.getName() + ": " + exc);
      }
    }

    // scan through data
    for (int j = 0; j < data.length; j++) {
      DataItem dItem = data[j];
      _stems.put(
          dItem.getStem() + family.getPOS(), new Pair<DataItem, EntriesItem[]>(dItem, entries));
      // index non-default preds to words
      if (!dItem.getStem().equals(dItem.getPred())) {
        Collection<Word> words = (Collection<Word>) _predToWords.get(dItem.getStem());
        if (words == null) {
          if (!openlex) {
            System.out.print("Warning: couldn't find words for pred '");
            System.out.println(dItem.getPred() + "' with stem '" + dItem.getStem() + "'");
          }
        } else {
          for (Iterator<Word> it = words.iterator(); it.hasNext(); ) {
            _predToWords.put(dItem.getPred(), it.next());
          }
        }
      }
    }

    // index rels to preds
    // nb: this covers relational (eg @x<GenRel>e) and featural (eg @e<tense>past)
    //     elementary predications
    List<String> indexRels = new ArrayList<String>(3);
    String familyIndexRel = family.getIndexRel();
    if (familyIndexRel.length() > 0) {
      indexRels.add(familyIndexRel);
    }
    for (int j = 0; j < entries.length; j++) {
      EntriesItem eItem = entries[j];
      String indexRel = eItem.getIndexRel();
      if (indexRel.length() > 0 && !indexRel.equals(familyIndexRel)) {
        indexRels.add(indexRel);
      }
    }
    for (Iterator<String> it = indexRels.iterator(); it.hasNext(); ) {
      String indexRel = it.next();
      // nb: not indexing on entries items, b/c some stems are still defaults
      for (int j = 0; j < data.length; j++) {
        DataItem dItem = data[j];
        _relsToPreds.put(indexRel, dItem.getPred());
      }
    }

    // index coart rels (features, really) to preds
    String coartRel = family.getCoartRel();
    if (coartRel.length() > 0) {
      for (int j = 0; j < data.length; j++) {
        _coartRelsToPreds.put(coartRel, data[j].getPred());
      }
    }
  }

  // index the macros
  _macros = new GroupMap<String, FeatureStructure>();
  // nb: could just index MacroItem objects for feature structures too;
  //     this might be a bit cleaner, but life is short
  _macroItems = new HashMap<String, MacroItem>();
  for (MacroItem mi : macroModel) {
    String macName = mi.getName();
    FeatureStructure[] specs = mi.getFeatureStructures();
    for (int j = 0; j < specs.length; j++) {
      _macros.put(macName, specs[j]);
    }
    // this is for handling the LF part of macros
    _macroItems.put(macName, mi);
  }

  // with morph items, check POS, macro names, excluded list for xref
  for (MorphItem morphItem : morph) {
    Word w = morphItem.getWord();
    if (!openlex
        && !_stems.containsKey(w.getStem() + w.getPOS())
        && !_posToEntries.containsKey(w.getPOS())) {
      System.err.println(
          "Warning: no entries for stem '" + w.getStem() + "' and POS '" + w.getPOS()
              + "' found for word '" + w + "'");
    }
    String[] macroNames = morphItem.getMacros();
    for (int j = 0; j < macroNames.length; j++) {
      if (!_macroItems.containsKey(macroNames[j])) {
        System.err.println(
            "Warning: macro " + macroNames[j] + " not found for word '"
                + morphItem.getWord() + "'");
      }
    }
    String[] excludedNames = morphItem.getExcluded();
    for (int j = 0; j < excludedNames.length; j++) {
      if (!familyAndEntryNames.contains(excludedNames[j])) {
        System.err.println(
            "Warning: excluded family or entry '" + excludedNames[j]
                + "' not found for word '" + morphItem.getWord() + "'");
      }
    }
  }
}
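// init() leans on GroupMap as a one-key-to-many-values index. A minimal sketch of the assumed
// semantics (the real class may differ): put() accumulates values under a key, and get()
// returns null for keys that were never indexed, as the callers above expect.
class GroupMapSketch<K, V> {
  private final Map<K, Set<V>> map = new HashMap<K, Set<V>>();

  public void put(K key, V value) {
    Set<V> group = map.get(key);
    if (group == null) {
      group = new LinkedHashSet<V>();
      map.put(key, group);
    }
    group.add(value);
  }

  public Set<V> get(K key) {
    return map.get(key); // null if the key was never indexed
  }

  public boolean containsKey(K key) {
    return map.containsKey(key);
  }
}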
/**
 * Main method; drives all of the other methods.
 *
 * @param args
 * @throws IOException
 */
public static void main(String[] args) throws IOException {
  String usageError =
      "Please provide a valid option. Such as: "
          + "\n -add FILENAME *creates new HITs from the data provided in the given file(s)* "
          + "\n -delete FILENAME *deletes all of the HITs with IDs matching those given in the file(s)*"
          + "\n -approveAll FILENAME *approves all the assignments for all HITs with IDs in the given file(s)*";
  if (args.length >= 1) {
    // create an instance of this class
    LexicalSubSurvey app = new LexicalSubSurvey();
    File inputFile = null;
    try {
      if (args.length > 1) inputFile = new File(args[1]);
      if (args[0].equals("-add")) {
        // the -add option adds HITs to Mechanical Turk, depending on the URL in the
        // mturk.properties file
        String[] parts = {
          "NN", "NNS", "JJ", "JJR", "JJS", "RB", "RBR", "RBS", "VB", "VBD", "VBG", "VBN",
          "VBP", "VBZ"
        };
        ArrayList<String> pos = new ArrayList<String>();
        for (int i = 0; i < parts.length; i++) {
          pos.add(parts[i]);
        }
        ExamplePairReader reader = new ExamplePairReader(PARSED, ALIGN);
        // typical file name: "sub.simple.first100"
        BufferedReader in =
            new BufferedReader(new InputStreamReader(new FileInputStream(inputFile)));
        DateFormat dateFormat = new SimpleDateFormat("yyyy-MM-ddHH:mm:ss");
        Date date = new Date();
        // the three different experiments; leave one uncommented at a time to do single
        // groupings
        app.contextpr =
            new PrintWriter(
                new FileOutputStream(
                    new File(inputFile.getName() + "ContextGivenIDs" + dateFormat.format(date))));
        app.partialContextpr =
            new PrintWriter(
                new FileOutputStream(
                    new File(
                        inputFile.getName() + "partialContextIDs" + dateFormat.format(date))));
        app.noContextpr =
            new PrintWriter(
                new FileOutputStream(
                    new File(
                        inputFile.getName() + "NoContextGivenIDs" + dateFormat.format(date))));
        Map<String, String> codeToPOS = new HashMap<String, String>(14);
        codeToPOS.put("NN", "Noun");
        codeToPOS.put("NNS", "Noun");
        codeToPOS.put("JJ", "Adjective");
        codeToPOS.put("JJR", "Adjective");
        codeToPOS.put("JJS", "Adjective");
        codeToPOS.put("RB", "Adverb");
        codeToPOS.put("RBR", "Adverb");
        codeToPOS.put("RBS", "Adverb");
        codeToPOS.put("VB", "Verb");
        codeToPOS.put("VBD", "Verb");
        codeToPOS.put("VBG", "Verb");
        codeToPOS.put("VBN", "Verb");
        codeToPOS.put("VBP", "Verb");
        codeToPOS.put("VBZ", "Verb");
        String input = in.readLine();
        Map<String, String[]> wordToSense = new HashMap<String, String[]>(25);
        String focusWord = "";
        String sense = "";
        String context = "";
        String simpleWord;
        while (input != null) {
          StringTokenizer splitter = new StringTokenizer(input, "\t");
          context = splitter.nextToken();
          splitter.nextToken();
          focusWord = splitter.nextToken();
          simpleWord = splitter.nextToken();
          sense = splitter.nextToken();
          String[] wordAssociations = {context, sense, simpleWord};
          wordToSense.put(focusWord, wordAssociations);
          input = in.readLine();
        }
        // for counted input: goes through until it reaches the end or the max number
        for (int k = 0; k < 1000000 && reader.hasNext(); k++) {
          ExamplePair p = reader.next();
          Alignment align = p.getAlignment();
          ArrayList<Word> normalWords = p.getNormal().getWords();
          ArrayList<Word> simpleWords = p.getSimple().getWords();
          // creates object = list of simple words
          SimpleWordsList simpleWordsList = new SimpleWordsList();
          for (AlignPair pair : align) {
            int n = pair.getNormalIndex();
            int s = pair.getSimpleIndex();
            Word normal = normalWords.get(n);
            Word simple = simpleWords.get(s);
            boolean diffWords =
                !normal.getWord().toLowerCase().equals(simple.getWord().toLowerCase());
            boolean normWordSimplePOS = pos.contains(normal.getPos());
            boolean posEqual = normal.getPos().equals(simple.getPos());
            boolean normalIsAlreadySimple = simpleWordsList.contains(normal.getWord());
            boolean doWeHaveSense = wordToSense.containsKey(normal.getWord());
            if (doWeHaveSense) context = wordToSense.get(normal.getWord())[0];
            boolean contextMatch = context.equals(p.getNormal().textString());
            if (diffWords
                && normWordSimplePOS
                && posEqual
                && !normalIsAlreadySimple
                && doWeHaveSense
                && contextMatch) {
              String firstPart = "";
              String partialFirst = "";
              String wordAfterFocus = normalWords.get(n + 1).getWord();
              String target = normal.getWord();
              // only append a space when the next token is not single-character punctuation
              if (!(wordAfterFocus.length() == 1 && wordAfterFocus.compareTo("A") < 0)) {
                target += " ";
              }
              String secondPart = "";
              String partialSecond = "";
              sense = wordToSense.get(normal.getWord())[1];
              String POS = codeToPOS.get(normal.getPos());
              for (int i = 0; i < normalWords.size(); i++) {
                String currentWord = normalWords.get(i).getWord();
                String nextWord = "";
                if (i + 1 < normalWords.size()) {
                  nextWord = normalWords.get(i + 1).getWord();
                }
                if (i < n) {
                  if (i > n - 3) partialFirst += currentWord;
                  firstPart += currentWord;
                  if (!(nextWord.length() == 1 && nextWord.compareTo("A") < 0)) {
                    firstPart += " ";
                    if (i > n - 3) partialFirst += " ";
                  }
                }
                if (i > n) {
                  if (i < n + 3) partialSecond += currentWord;
                  secondPart += currentWord;
                  if (!(nextWord.length() == 1 && nextWord.compareTo("A") < 0)) {
                    secondPart += " ";
                    if (i < n + 3) partialSecond += " ";
                  }
                }
              }
              // comment out two of the three for a single grouping
              app.createContextGivenSurvey(firstPart, target, secondPart);
              app.createPartialContextGivenSurvey(partialFirst, target, partialSecond, sense, POS);
              app.createNoContextGivenSurvey(target, sense, POS);
            }
          }
        }
        // comment out two for a single grouping
        app.contextpr.close();
        app.partialContextpr.close();
        app.noContextpr.close();
      } else if (args[0].equals("-delete")) {
        // deletes the HITs whose IDs are in the given file
        System.out.println("deleting");
        // IDs are usually stored in these files: NoContextGivenIDs, NoTargetGivenIDs,
        // ContextGivenIDs
        BufferedReader fileReader =
            new BufferedReader(new InputStreamReader(new FileInputStream(inputFile)));
        for (String hitId = fileReader.readLine(); hitId != null; hitId = fileReader.readLine()) {
          System.out.println(hitId);
          app.deleteHIT(hitId);
        }
      } else if (args[0].equals("-approveAll")) {
        // approves all submissions for all HITs whose IDs are in the given file
        System.out.println("approving");
        // IDs are usually stored in these files: NoContextGivenIDs, NoTargetGivenIDs,
        // ContextGivenIDs
        BufferedReader fileReader =
            new BufferedReader(new InputStreamReader(new FileInputStream(inputFile)));
        for (String hitId = fileReader.readLine(); hitId != null; hitId = fileReader.readLine()) {
          System.out.println(hitId);
          app.approveHIT(hitId);
        }
      } else {
        System.err.println("No valid options were provided");
        System.out.println(usageError);
      }
    } catch (IOException e) {
      System.err.println("Could not find the file: \"" + args[1] + "\"");
      System.err.println("Please provide a valid file name");
    }
  } else {
    System.out.println(usageError);
  }
}
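// Example invocations (illustrative file names), matching the options parsed above:
//   java LexicalSubSurvey -add sub.simple.first100
//   java LexicalSubSurvey -delete ContextGivenIDs<date>       (one HIT ID per line)
//   java LexicalSubSurvey -approveAll NoContextGivenIDs<date>  (same file format)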
public static void main(String[] args) throws Exception {
  // hardcoded test arguments (the real command-line args are ignored)
  String[] args2 = new String[3];
  args2[0] =
      "/Users/EsferaDePandora/Copy/2015/CICLO 2/CC2003 ALGORITMOS Y ESTRUCTURAS DE DATOS/Hojas de trabajo/HT09 - BST y MAPEO/HT9 proyecto/src/words.txt";
  args2[1] =
      "/Users/EsferaDePandora/Copy/2015/CICLO 2/CC2003 ALGORITMOS Y ESTRUCTURAS DE DATOS/Hojas de trabajo/HT09 - BST y MAPEO/HT9 proyecto/src/text.txt";
  args2[2] = "5";

  if (args2.length > 1) {
    File wordFile = new File(args2[0]);
    File textFile = new File(args2[1]);

    ///////////////////////////////////////////////////////////////////
    // 1 SimpleSet
    // 2 Red Black Tree
    // 3 Splay Tree
    // 4 Hash Table
    // 5 TreeMap (from the Java collections framework)
    ///////////////////////////////////////////////////////////////////
    int implementacion = Integer.parseInt(args2[2]);

    BufferedReader wordreader;
    BufferedReader textreader;
    int verbs = 0;
    int nouns = 0;
    int adjectives = 0;
    int adverbs = 0;
    int gerunds = 0;
    long starttime;
    long endtime;

    // verify that the parameter files exist
    if (wordFile.isFile() && textFile.isFile()) {
      // open the files for reading
      try {
        wordreader = new BufferedReader(new FileReader(wordFile));
        textreader = new BufferedReader(new FileReader(textFile));
      } catch (Exception ex) {
        System.out.println("Error reading the files!");
        return;
      }

      WordSet words = WordSetFactory.generateSet(implementacion);

      String line = null;
      String[] wordParts;

      // load the word file
      starttime = System.currentTimeMillis();
      line = wordreader.readLine();
      while (line != null) {
        wordParts = line.split("\\."); // the quoted string is a regular expression
        if (wordParts.length == 2) {
          words.add(new Word(wordParts[0].trim(), wordParts[1].trim()));
        }
        line = wordreader.readLine();
      }
      wordreader.close();
      endtime = System.currentTimeMillis();
      System.out.println("Words loaded in " + (endtime - starttime) + " ms.");

      starttime = System.currentTimeMillis();
      line = textreader.readLine();
      String[] textParts;
      Word currentword;
      Word lookupword = new Word();
      while (line != null) {
        // split the line into words
        textParts = line.split("[^\\w-]+");
        // look up the type of each word
        for (int i = 0; i < textParts.length; i++) {
          lookupword.setWord(textParts[i].trim().toLowerCase());
          currentword = words.get(lookupword);
          if (currentword != null) {
            if (currentword.getType().equals("v-d")
                || currentword.getType().equals("v")
                || currentword.getType().equals("q")) verbs++;
            else if (currentword.getType().equals("g")) gerunds++;
            else if (currentword.getType().equals("a-s")
                || currentword.getType().equals("a-c")
                || currentword.getType().equals("a")) adjectives++;
            else if (currentword.getType().equals("e")) adverbs++;
            else nouns++;
          }
        }
        line = textreader.readLine();
      }
      textreader.close();
      endtime = System.currentTimeMillis();
      System.out.println("Text analyzed in " + (endtime - starttime) + " ms.");

      System.out.println("The text contains:");
      System.out.println(verbs + " verbs");
      System.out.println(nouns + " nouns");
      System.out.println(adjectives + " adjectives");
      System.out.println(adverbs + " adverbs");
      System.out.println(gerunds + " gerunds");
    } else {
      System.out.println("Could not find the files :'( ");
    }
  } else {
    System.out.println("Missing parameters.");
  }
}
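// Expected words.txt format, inferred from the split("\\.") above (illustrative lines):
//   jump. v
//   running. g
//   tall. a
//   quickly. e
// The text before the '.' is the word; the trimmed remainder is its type code
// (v/v-d/q = verb, g = gerund, a/a-s/a-c = adjective, e = adverb, anything else = noun).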
protected void out(Word w) {
  out(w.getLangId() + ": " + w.getSpelling() + " (" + w.getSign() + ")");
}
/* public methods */
public DirectedGraph parse() {
  /* add the parts of speech */
  addPOS();

  /* update all nodes to include parts of speech */
  System.out.println("Parsing file for parts of speech analysis...");

  byte[] buffer = new byte[100000]; // 100 kb
  int length = 0;
  try {
    /* retrieve the data from the file */
    FileInputStream fin = new FileInputStream(file);
    length = fin.read(buffer);
    fin.close();
  } catch (Exception e) {
    System.out.println("IO Error: " + e);
  }

  /* transfer data to a string; only the bytes actually read are converted
     (the original converted the whole buffer, appending NUL padding to the data) */
  String data = new String(buffer, 0, Math.max(length, 0));

  /* create a tokenizer to parse the data */
  StringTokenizer st =
      new StringTokenizer(
          data, " :;\"\n\t\r_,.!?`\u2015\u2012\u2014\u2013\u2212"); // unicode for dashes

  /* temporary variables */
  String pos = "";
  String wordString = "";
  Word word = new Word("");
  Node node = new Node(word);

  while (st.hasMoreTokens()) {
    /* take care of extraneous hyphens */
    String test = st.nextToken();
    while (test.equals("-")) {
      test = st.nextToken();
    }
    while (test.equals("--")) {
      test = st.nextToken();
    }

    /* put the string to lowercase */
    wordString = test.toLowerCase();

    /* get the POS */
    if (st.hasMoreTokens()) {
      pos = st.nextToken();
    }

    /* if we have the possessive case */
    if (wordString.equals("'s")) {
      /* create a node object from the previous iteration */
      Word possessiveWord = new Word(word.toString() + "'s");
      node = new Node(possessiveWord);

      /* get the position of the node in the graph */
      int index = result.findIndex(node);

      /* add the possessive node to the graph */
      if (index >= 0) {
        /* make sure we get all the associations */
        node = result.nodeAt(index);
        /* transfer the part of speech */
        node.getWord().setPartOfSpeech(word.getPartOfSpeech());
        /* add the possessive quality */
        node.getWord().setPossessive();
        /* insert the node into our array at the right position */
        (result.getNodes())[index] = node;
      }
    } else {
      word = new Word(wordString);

      /* map the Penn Treebank tag to its numeric code */
      if (pos.equals("AFX")) { word.setPartOfSpeech(1); }
      else if (pos.equals("CC")) { word.setPartOfSpeech(2); }
      else if (pos.equals("CD")) { word.setPartOfSpeech(3); }
      else if (pos.equals("DT")) { word.setPartOfSpeech(4); }
      else if (pos.equals("EX")) { word.setPartOfSpeech(5); }
      else if (pos.equals("FW")) { word.setPartOfSpeech(6); }
      else if (pos.equals("IN")) { word.setPartOfSpeech(7); }
      else if (pos.equals("JJ")) { word.setPartOfSpeech(8); }
      else if (pos.equals("JJR")) { word.setPartOfSpeech(9); }
      else if (pos.equals("JJS")) { word.setPartOfSpeech(10); }
      else if (pos.equals("LS")) { word.setPartOfSpeech(11); }
      else if (pos.equals("MD")) { word.setPartOfSpeech(12); }
      else if (pos.equals("NN")) { word.setPartOfSpeech(13); }
      else if (pos.equals("NNP")) { word.setPartOfSpeech(14); }
      else if (pos.equals("NNPS")) { word.setPartOfSpeech(15); }
      else if (pos.equals("NNS")) { word.setPartOfSpeech(16); }
      else if (pos.equals("PDT")) { word.setPartOfSpeech(17); }
      else if (pos.equals("POS")) { word.setPartOfSpeech(18); }
      else if (pos.equals("PRP")) { word.setPartOfSpeech(19); }
      else if (pos.equals("PRP$")) { word.setPartOfSpeech(20); }
      else if (pos.equals("RB")) { word.setPartOfSpeech(21); }
      else if (pos.equals("RBR")) { word.setPartOfSpeech(22); }
      else if (pos.equals("RBS")) { word.setPartOfSpeech(23); }
      else if (pos.equals("RP")) { word.setPartOfSpeech(24); }
      else if (pos.equals("SYM")) { word.setPartOfSpeech(25); }
      else if (pos.equals("TO")) { word.setPartOfSpeech(26); }
      else if (pos.equals("UH")) { word.setPartOfSpeech(27); }
      else if (pos.equals("VB")) { word.setPartOfSpeech(28); }
      else if (pos.equals("VBD")) { word.setPartOfSpeech(29); }
      else if (pos.equals("VBG")) { word.setPartOfSpeech(30); }
      else if (pos.equals("VBN")) { word.setPartOfSpeech(31); }
      else if (pos.equals("VBP")) { word.setPartOfSpeech(32); }
      else if (pos.equals("VBZ")) { word.setPartOfSpeech(33); }
      else if (pos.equals("WDT")) { word.setPartOfSpeech(34); }
      else if (pos.equals("WP")) { word.setPartOfSpeech(35); }
      else if (pos.equals("WPS")) { word.setPartOfSpeech(36); }
      else if (pos.equals("WRB")) { word.setPartOfSpeech(37); }

      node = new Node(word);
      int index = result.findIndex(node);
      if (index >= 0) {
        /* make sure we get all the associations */
        node = result.nodeAt(index);
        /* transfer the part of speech */
        node.getWord().setPartOfSpeech(word.getPartOfSpeech());
        /* insert the node into our array at the right position */
        (result.getNodes())[index] = node;
      }
    }
  } // end while

  return result;
}