Example #1
 // look up and apply coarts for w to each sign in result
 @SuppressWarnings("unchecked")
 private void applyCoarts(Word w, SignHash result) throws LexException {
   List<Sign> inputSigns = new ArrayList<Sign>(result.asSignSet());
   result.clear();
   List<Sign> outputSigns = new ArrayList<Sign>(inputSigns.size());
   // for each surface attr, lookup coarts and apply to input signs, storing results in output
   // signs
   for (Iterator<Pair<String, String>> it = w.getSurfaceAttrValPairs(); it.hasNext(); ) {
      Pair<String, String> p = it.next();
      String attr = p.a;
      if (!_indexedCoartAttrs.contains(attr)) continue;
      String val = p.b;
     Word coartWord = Word.createWord(attr, val);
     SignHash coartResult = getSignsFromWord(coartWord, null, null, null);
     for (Iterator<Sign> it2 = coartResult.iterator(); it2.hasNext(); ) {
       Sign coartSign = it2.next();
       // apply to each input
       for (int j = 0; j < inputSigns.size(); j++) {
         Sign sign = inputSigns.get(j);
         grammar.rules.applyCoart(sign, coartSign, outputSigns);
       }
     }
     // switch output to input for next iteration
     inputSigns.clear();
     inputSigns.addAll(outputSigns);
     outputSigns.clear();
   }
   // add results back
   result.addAll(inputSigns);
 }
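The method above uses a ping-pong buffer: each coarticulation pass consumes inputSigns and appends to outputSigns, which then becomes the input of the next pass. A minimal, self-contained sketch of that pattern, with a hypothetical BiConsumer-based transform standing in for grammar.rules.applyCoart (names here are illustrative, not the project's actual API):

import java.util.ArrayList;
import java.util.List;
import java.util.function.BiConsumer;

class PingPongSketch {
  // Each pass reads one item and may append zero or more results, like applyCoart above.
  static <T> List<T> applyPasses(List<BiConsumer<T, List<T>>> passes, List<T> items) {
    List<T> input = new ArrayList<T>(items);
    List<T> output = new ArrayList<T>(input.size());
    for (BiConsumer<T, List<T>> pass : passes) {
      for (T item : input) pass.accept(item, output);
      // switch output to input for the next pass, exactly as applyCoarts does
      input.clear();
      input.addAll(output);
      output.clear();
    }
    return input;
  }
}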
Example #2
  // get signs with additional args for a known special token const, target pred and target rel
  private SignHash getSignsFromWord(
      Word w, String specialTokenConst, String targetPred, String targetRel) throws LexException {

    Collection<MorphItem> morphItems =
        (specialTokenConst == null) ? (Collection<MorphItem>) _words.get(w) : null;

    if (morphItems == null) {
      // check for special tokens
      if (specialTokenConst == null) {
        specialTokenConst =
            tokenizer.getSpecialTokenConstant(tokenizer.isSpecialToken(w.getForm()));
        targetPred = w.getForm();
      }
      if (specialTokenConst != null) {
        Word key = Word.createSurfaceWord(w, specialTokenConst);
        morphItems = (Collection<MorphItem>) _words.get(key);
      }
      // otherwise throw lex exception
      if (morphItems == null) throw new LexException(w + " not in lexicon");
    }

    SignHash result = new SignHash();

    for (Iterator<MorphItem> it = morphItems.iterator(); it.hasNext(); ) {
      getWithMorphItem(w, it.next(), targetPred, targetRel, result);
    }

    return result;
  }
Example #3
  /**
   * overhaulSentence is the main method of this class: it scans every word in the sentence and
   * attempts to swap each word whose part of speech is among those being targeted.
   *
   * @param sentence the sentence to rewrite
   * @param partsSearching the parts of speech whose words should be swapped
   * @return the sentence with words replaced
   */
  public ArrayList<Word> overhaulSentence(
      ArrayList<Word> sentence, ArrayList<String> partsSearching) {
    int index = 0;
    String k;
    // iterate through the sentence, check each word's POS, and set newVals to synonyms
    for (Word w : sentence) {
      for (String part : partsSearching) {
        if (w.getPOS().equals(part)) {
          k = findSynonym(sentence, index);
          w.setNewValue(k); // apply the synonym so the length check below can inspect it
          // discard the synonym unless it is strictly longer than the original
          if (w.getNewLength() <= w.getOrigLength()) w.setNewValue(null);
        }
      }
      index++;
    }

    return sentence;
  }
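A hypothetical usage sketch; the (text, POS) Word constructor, the enclosing class name, and the synonym source are assumptions, not the project's actual API:

// Hypothetical driver; constructor and class names are illustrative only.
ArrayList<Word> sentence = new ArrayList<Word>();
sentence.add(new Word("utilize", "VB"));
sentence.add(new Word("data", "NN"));

ArrayList<String> targets = new ArrayList<String>();
targets.add("VB"); // only verbs are candidates for swapping here

ArrayList<Word> rewritten = new SentenceOverhauler().overhaulSentence(sentence, targets);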
Example #4
 /**
  * For a given word, return all of its surface word's lexical entries. If the word is not listed
  * in the lexicon, the tokenizer is consulted to see if it is a special token (date, time, etc.);
  * otherwise an exception is thrown. If the word has coarticulations, all applicable
  * coarticulation entries are applied to the base word, in an arbitrary order.
  *
  * @param w the word
  * @return a sign hash
  * @exception LexException thrown if word not found
  */
 public SignHash getSignsFromWord(Word w) throws LexException {
   // reduce word to its core, removing coart attrs if any
   Word surfaceWord = Word.createSurfaceWord(w);
   Word coreWord =
       (surfaceWord.attrsIntersect(_coartAttrs))
           ? Word.createCoreSurfaceWord(surfaceWord, _coartAttrs)
           : surfaceWord;
   // lookup core word
   SignHash result = getSignsFromWord(coreWord, null, null, null);
   if (result.size() == 0) {
     throw new LexException(coreWord + " not found in lexicon");
   }
   // return signs if no coart attrs
   if (coreWord == surfaceWord) return result;
   // otherwise apply coarts for word
   applyCoarts(surfaceWord, result);
   return result;
 }
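A minimal usage sketch of the public lookup; lexicon stands in for the enclosing instance and w for a previously built Word (both assumptions, since neither is shown here):

// Hedged sketch: 'lexicon' and 'w' are assumed to be in scope.
try {
  SignHash signs = lexicon.getSignsFromWord(w);
  for (Sign sign : signs.asSignSet()) {
    System.out.println(sign); // one lexical entry per sign
  }
} catch (LexException e) {
  System.err.println("lookup failed: " + e);
}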
Example #5
  public Word copy() {
    Word w = new Word();

    w.text = text;
    w.wordnum = wordnum;
    w.word = word;
    w.pattern = pattern;
    //	w.score = score;
    w.enabled = enabled;
    w.numUniqueLetters = numUniqueLetters;
    w.uniqueletters = uniqueletters;

    // shallow copy: the array itself is new, but the Candidate elements are shared
    w.candidates = candidates.clone();

    return w;
  }
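Note that copy() is shallow with respect to the elements: the candidates array is duplicated, but both Word instances share the same Candidate objects. A small sketch of the consequence, assuming field access as inside the class:

// Inside the class, where the fields are directly accessible:
Word duplicate = this.copy();
assert duplicate.candidates != this.candidates; // distinct arrays
assert duplicate.candidates[0] == this.candidates[0]; // shared elements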
Example #6
  // get signs using an additional arg for a target rel
  private Collection<Sign> getSignsFromPredAndTargetRel(String pred, String targetRel) {

    Collection<Word> words = (Collection<Word>) _predToWords.get(pred);
    String specialTokenConst = null;

    // for robustness, when using the supertagger, add words for the pred sans sense index
    int dotIndex = -1;
    if (_supertagger != null
        && !Character.isDigit(pred.charAt(0)) // skip numbers
        && (dotIndex = pred.lastIndexOf('.')) > 0
        && pred.length() > dotIndex + 1
        && pred.charAt(dotIndex + 1) != '_') // skip titles, eg Mr._Smith
    {
      String barePred = pred.substring(0, dotIndex);
      Collection<Word> barePredWords = (Collection<Word>) _predToWords.get(barePred);
      if (words == null) words = barePredWords;
      else if (barePredWords != null) {
        Set<Word> unionWords = new HashSet<Word>(words);
        unionWords.addAll(barePredWords);
        words = unionWords;
      }
    }

    if (words == null) {
      specialTokenConst = tokenizer.getSpecialTokenConstant(tokenizer.isSpecialToken(pred));
      if (specialTokenConst == null) return null;
      // lookup words with pred = special token const
      Collection<Word> specialTokenWords = (Collection<Word>) _predToWords.get(specialTokenConst);
      if (specialTokenWords == null) return null;
      // replace the special token const with the pred
      words = new ArrayList<Word>(specialTokenWords.size());
      for (Iterator<Word> it = specialTokenWords.iterator(); it.hasNext(); ) {
        Word stw = it.next();
        Word w = Word.createSurfaceWord(stw, pred);
        words.add(w);
      }
    }

    List<Sign> retval = new ArrayList<Sign>();
    for (Iterator<Word> it = words.iterator(); it.hasNext(); ) {
      Word w = it.next();
      try {
        SignHash signs = getSignsFromWord(w, specialTokenConst, pred, targetRel);
        retval.addAll(signs.asSignSet());
      }
      // shouldn't happen
      catch (LexException exc) {
        System.err.println("Unexpected lex exception for word " + w + ": " + exc);
      }
    }
    return retval;
  }
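The bare-pred fallback strips a trailing sense index (e.g. bank.2 becomes bank) while skipping plain numbers and title-like forms. A small self-contained sketch mirroring the guards above (the helper name is illustrative):

// Minimal sketch of the sense-index test used above.
static String barePred(String pred) {
  int dotIndex;
  if (!Character.isDigit(pred.charAt(0)) // skip numbers like 3.14
      && (dotIndex = pred.lastIndexOf('.')) > 0 // need a dot past position 0
      && pred.length() > dotIndex + 1 // something must follow the dot
      && pred.charAt(dotIndex + 1) != '_') { // skip titles, eg Mr._Smith
    return pred.substring(0, dotIndex);
  }
  return null; // nothing to strip
}
// barePred("bank.2") -> "bank"; barePred("Mr._Smith") -> null; barePred("3.14") -> null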
Example #7
  public PalindromeTest() {
    ArrayList<String> words = new ArrayList<String>();
    ArrayList<String> palis;

    try {
      words.addAll(Files.readAllLines(Paths.get("com/jsanders/web2")));
    } catch (IOException ex) {
      System.out.println("IO error");
      System.exit(1);
    }

    palis = Palindrome.palindromes(words);
    assertEquals(161, palis.size());

    int shortest = Word.shortestLength(words);
    assertEquals(1, shortest);

    int longest = Word.longestLength(words);
    assertEquals(24, longest);

    ArrayList<String> shortestWords = Word.shortestWords(words);
    assertEquals(52, shortestWords.size());

    ArrayList<String> longestWords = Word.longestWords(words);
    assertEquals(5, longestWords.size());

    int totalWords = Word.totalWords(words);
    double avgLen = Word.averageLength(words);

    assertEquals(235886, totalWords);
    assertEquals(9.56, avgLen, 0.01);

    ArrayList<Double> letterFreq = Word.letterFrequency(words);
    assertEquals(0.087, letterFreq.get(0), 0.01);

    double properFreq = Word.properFrequency(words);
    assertEquals(0.106, properFreq, 0.01);

    ArrayList<Integer> startFreq = Word.startFrequency(words);
    assertEquals(17096, startFreq.get(0).intValue());

    ArrayList<String> sameStartEnd = Word.startEndWords(words);
    assertEquals(11505, sameStartEnd.size());

    try {
      PrintWriter f = new PrintWriter("short.txt");
      for (String w : shortestWords) f.println(w);
      f.close();

      f = new PrintWriter("long.txt");
      for (String w : longestWords) f.println(w);
      f.close();

      f = new PrintWriter("same.txt");
      for (String w : sameStartEnd) f.println(w);
      f.close();

      f = new PrintWriter("statistics.txt");
      f.println("avg word len: " + avgLen);
      f.println("freq of letters: " + letterFreq);
      f.println("freq of proper nouns/names: " + properFreq);
      f.println("words that start with each letter:: " + startFreq);

      f.close();
    } catch (IOException ex) {
      System.out.println("IO error");
      System.exit(1);
    }
  }
Example #8
  // get signs for a given EntriesItem
  private void getWithEntriesItem(
      Word w,
      MorphItem mi,
      String stem,
      String pred,
      String targetPred,
      String targetRel,
      EntriesItem item,
      MacroAdder macAdder,
      Map<String, Double> supertags,
      Set<String> supertagsFound,
      SignHash result) {
    // ensure apropos
    if (targetPred != null && !targetPred.equals(pred)) return;
    if (targetRel != null
        && !targetRel.equals(item.getIndexRel())
        && !targetRel.equals(item.getCoartRel())) return;
    if (!item.getActive().booleanValue()) return;
    if (mi.excluded(item)) return;

    try {
      // copy and add macros
      Category cat = item.getCat().copy();
      macAdder.addMacros(cat);

      // replace DEFAULT_VAL with pred, after first
      // unifying type of associated nom var(s) with sem class
      unifySemClass(cat, mi.getWord().getSemClass());
      REPLACEMENT = pred;
      cat.deepMap(defaultReplacer);

      // check supertag
      // TODO: think about earlier checks for efficiency, for grammars where macros and preds don't
      // matter
      // Double lexprob = null; // nb: skipping lex log probs, don't seem to be helpful
      if (supertags != null) {
        // skip if not found
        String stag = cat.getSupertag();
        if (!supertags.containsKey(stag)) return;
        // otherwise update found supertags
        supertagsFound.add(stag);
        // get lex prob
        // lexprob = supertags.get(stag);
      }

      // propagate types of nom vars
      propagateTypes(cat);

      // handle distrib attrs and inherits-from
      propagateDistributiveAttrs(cat);
      expandInheritsFrom(cat);

      // merge stem, pos, sem class from morph item, plus supertag from cat
      Word word = Word.createFullWord(w, mi.getWord(), cat.getSupertag());

      // set origin and lexprob
      Sign sign = new Sign(word, cat);
      sign.setOrigin();
      // if (lexprob != null) {
      //	sign.addData(new SupertaggerAdapter.LexLogProb((float) Math.log10(lexprob)));
      // }
      // return sign
      result.insert(sign);
    } catch (RuntimeException exc) {
      System.err.println(
          "Warning: ignoring entry: "
              + item.getName()
              + " of family: "
              + item.getFamilyName()
              + " for stem: "
              + stem
              + " b/c: "
              + exc.toString());
    }
  }
Example #9
  /** Loads the lexicon and morph files. */
  public void init(URL lexiconUrl, URL morphUrl) throws IOException {

    // load category families (lexicon), morph forms and macros
    List<Family> lexicon = getLexicon(lexiconUrl);
    Pair<List<MorphItem>, List<MacroItem>> morphInfo = getMorph(morphUrl);
    List<MorphItem> morph = morphInfo.a;
    List<MacroItem> macroModel = morphInfo.b;

    // index words; also index stems to words, as default preds
    // store indexed coarticulation attrs too
    _words = new GroupMap<Word, MorphItem>();
    _predToWords = new GroupMap<String, Word>();
    _coartAttrs = new HashSet<String>();
    _indexedCoartAttrs = new HashSet<String>();
    for (MorphItem morphItem : morph) {
      Word surfaceWord = morphItem.getSurfaceWord();
      _words.put(surfaceWord, morphItem);
      _predToWords.put(morphItem.getWord().getStem(), surfaceWord);
      if (morphItem.isCoart()) {
        Word indexingWord = morphItem.getCoartIndexingWord();
        _words.put(indexingWord, morphItem);
        Pair<String, String> first = indexingWord.getSurfaceAttrValPairs().next();
        _indexedCoartAttrs.add(first.a);
        for (Iterator<Pair<String, String>> it = surfaceWord.getSurfaceAttrValPairs();
            it.hasNext(); ) {
          Pair<String, String> p = it.next();
          _coartAttrs.add(p.a);
        }
      }
    }

    // index entries based on stem+pos
    _stems = new GroupMap<String, Object>();
    _posToEntries = new GroupMap<String, EntriesItem[]>();
    // index entries by supertag+pos, for supertagging
    _stagToEntries = new GroupMap<String, EntriesItem>();
    // also index rels and coart rels to preds
    _relsToPreds = new GroupMap<String, String>();
    _coartRelsToPreds = new GroupMap<String, String>();
    // and gather list of attributes used per atomic category type
    _catsToAttrs = new GroupMap<String, String>();
    _lfAttrs = new HashSet<String>();
    // and remember family and entry names, for checking the excluded list on morph items
    HashSet<String> familyAndEntryNames = new HashSet<String>();

    // index each family
    for (Family family : lexicon) {

      familyAndEntryNames.add(family.getName());
      EntriesItem[] entries = family.getEntries();
      DataItem[] data = family.getData();

      // for generic use when we get an unknown stem
      // from the morphological analyzer
      if (!family.isClosed()) {
        _posToEntries.put(family.getPOS(), entries);
      }

      // scan through entries
      for (int j = 0; j < entries.length; j++) {
        // index
        EntriesItem eItem = entries[j];
        _stagToEntries.put(eItem.getSupertag() + family.getPOS(), eItem);
        if (eItem.getStem().length() > 0) {
          _stems.put(eItem.getStem() + family.getPOS(), eItem);
        }
        try {
          // gather features
          eItem.getCat().forall(gatherAttrs);
          // record names
          familyAndEntryNames.add(eItem.getName());
          familyAndEntryNames.add(eItem.getQualifiedName());
        } catch (RuntimeException exc) {
          System.err.println("exception for: " + family.getName() + ": " + exc);
        }
      }

      // scan through data
      for (int j = 0; j < data.length; j++) {
        DataItem dItem = data[j];
        _stems.put(
            dItem.getStem() + family.getPOS(), new Pair<DataItem, EntriesItem[]>(dItem, entries));
        // index non-default preds to words
        if (!dItem.getStem().equals(dItem.getPred())) {
          Collection<Word> words = (Collection<Word>) _predToWords.get(dItem.getStem());
          if (words == null) {
            if (!openlex) {
              System.out.print("Warning: couldn't find words for pred '");
              System.out.println(dItem.getPred() + "' with stem '" + dItem.getStem() + "'");
            }
          } else {
            for (Iterator<Word> it = words.iterator(); it.hasNext(); ) {
              _predToWords.put(dItem.getPred(), it.next());
            }
          }
        }
      }

      // index rels to preds
      // nb: this covers relational (eg @x<GenRel>e) and featural (eg @e<tense>past)
      //     elementary predications
      List<String> indexRels = new ArrayList<String>(3);
      String familyIndexRel = family.getIndexRel();
      if (familyIndexRel.length() > 0) {
        indexRels.add(familyIndexRel);
      }
      for (int j = 0; j < entries.length; j++) {
        EntriesItem eItem = entries[j];
        String indexRel = eItem.getIndexRel();
        if (indexRel.length() > 0 && !indexRel.equals(familyIndexRel)) {
          indexRels.add(indexRel);
        }
      }
      for (Iterator<String> it = indexRels.iterator(); it.hasNext(); ) {
        String indexRel = it.next();
        // nb: not indexing on entries items, b/c some stems are still defaults
        for (int j = 0; j < data.length; j++) {
          DataItem dItem = data[j];
          _relsToPreds.put(indexRel, dItem.getPred());
        }
      }

      // index coart rels (features, really) to preds
      String coartRel = family.getCoartRel();
      if (coartRel.length() > 0) {
        for (int j = 0; j < data.length; j++) {
          _coartRelsToPreds.put(coartRel, data[j].getPred());
        }
      }
    }

    // index the macros
    _macros = new GroupMap<String, FeatureStructure>();
    // nb: could just index MacroItem objects for feature structures too;
    //     this might be a bit cleaner, but life is short
    _macroItems = new HashMap<String, MacroItem>();
    for (MacroItem mi : macroModel) {
      String macName = mi.getName();
      FeatureStructure[] specs = mi.getFeatureStructures();
      for (int j = 0; j < specs.length; j++) {
        _macros.put(macName, specs[j]);
      }
      // this is for handling LF part of macros
      _macroItems.put(macName, mi);
    }

    // with morph items, check POS, macro names, excluded list for xref
    for (MorphItem morphItem : morph) {
      Word w = morphItem.getWord();
      if (!openlex
          && !_stems.containsKey(w.getStem() + w.getPOS())
          && !_posToEntries.containsKey(w.getPOS())) {
        System.err.println(
            "Warning: no entries for stem '"
                + w.getStem()
                + "' and POS '"
                + w.getPOS()
                + "' found for word '"
                + w
                + "'");
      }
      String[] macroNames = morphItem.getMacros();
      for (int j = 0; j < macroNames.length; j++) {
        if (!_macroItems.containsKey(macroNames[j])) {
          System.err.println(
              "Warning: macro "
                  + macroNames[j]
                  + " not found for word '"
                  + morphItem.getWord()
                  + "'");
        }
      }
      String[] excludedNames = morphItem.getExcluded();
      for (int j = 0; j < excludedNames.length; j++) {
        if (!familyAndEntryNames.contains(excludedNames[j])) {
          System.err.println(
              "Warning: excluded family or entry '"
                  + excludedNames[j]
                  + "' not found for word '"
                  + morphItem.getWord()
                  + "'");
        }
      }
    }
  }
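init() relies on GroupMap behaving as a multimap: put accumulates values under a key, get returns the accumulated collection or null, and containsKey tests for a key. A minimal sketch of that contract, under the assumption that it matches the real class (which is not shown here):

import java.util.Collection;
import java.util.HashMap;
import java.util.LinkedHashSet;
import java.util.Map;

// Minimal multimap sketch matching how GroupMap is used in init().
class GroupMapSketch<K, V> {
  private final Map<K, Collection<V>> map = new HashMap<K, Collection<V>>();

  public void put(K key, V value) {
    map.computeIfAbsent(key, k -> new LinkedHashSet<V>()).add(value);
  }

  public Collection<V> get(K key) {
    return map.get(key); // null when the key has no group
  }

  public boolean containsKey(K key) {
    return map.containsKey(key);
  }
}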
Example #10
  /**
   * Main method drives all methods
   *
   * @param args
   * @throws IOException
   */
  public static void main(String[] args) throws IOException {

    String usageError =
        "Please provide a valid option, such as: "
            + "\n -add FILENAME 				*creates new HITs from the data provided in the given file(s)* "
            + "\n -delete FILENAME        	*deletes all of the HITs with IDs matching those given in the file(s)*"
            + "\n -approveAll FILENAME 		*approves all the assignments for all HITs with IDs in the given file(s)*";

    if (args.length >= 1) {
      // Create an instance of this class.
      LexicalSubSurvey app = new LexicalSubSurvey();
      File inputFile = null;

      try {
        if (args.length > 1) inputFile = new File(args[1]);

        if (args[0].equals("-add")) {
          // When the -add flag is given, adds HITs to Mechanical Turk depending on the URL in
          // the mturk.properties file

          String[] parts = {
            "NN", "NNS", "JJ", "JJR", "JJS", "RB", "RBR", "RBS", "VB", "VBD", "VBG", "VBN", "VBP",
            "VBZ"
          };
          ArrayList<String> pos = new ArrayList<String>();
          for (int i = 0; i < parts.length; i++) {
            pos.add(parts[i]);
          }

          ExamplePairReader reader = new ExamplePairReader(PARSED, ALIGN);
          BufferedReader in =
              new BufferedReader(
                  new InputStreamReader(
                      new FileInputStream(inputFile))); // typical file name: "sub.simple.first100"
          DateFormat dateFormat = new SimpleDateFormat("yyyy-MM-ddHH:mm:ss");
          Date date = new Date();

          // The three different experiments; leave one uncommented at a time to do single groupings
          app.contextpr =
              new PrintWriter(
                  new FileOutputStream(
                      new File(inputFile.getName() + "ContextGivenIDs" + dateFormat.format(date))));
          app.partialContextpr =
              new PrintWriter(
                  new FileOutputStream(
                      new File(
                          inputFile.getName() + "partialContextIDs" + dateFormat.format(date))));
          app.noContextpr =
              new PrintWriter(
                  new FileOutputStream(
                      new File(
                          inputFile.getName() + "NoContextGivenIDs" + dateFormat.format(date))));

          Map<String, String> codeToPOS = new HashMap<String, String>(14);
          codeToPOS.put("NN", "Noun");
          codeToPOS.put("NNS", "Noun");
          codeToPOS.put("JJ", "Adjective");
          codeToPOS.put("JJR", "Adjective");
          codeToPOS.put("JJS", "Adjective");
          codeToPOS.put("RB", "Adverb");
          codeToPOS.put("RBR", "Adverb");
          codeToPOS.put("RBS", "Adverb");
          codeToPOS.put("VB", "Verb");
          codeToPOS.put("VBD", "Verb");
          codeToPOS.put("VBG", "Verb");
          codeToPOS.put("VBN", "Verb");
          codeToPOS.put("VBP", "Verb");
          codeToPOS.put("VBZ", "Verb");

          String input = in.readLine();
          Map<String, String[]> wordToSense = new HashMap<String, String[]>(25);
          String focusWord = "";
          String sense = "";
          String context = "";
          String simpleWord;
          while (input != null) {
            StringTokenizer splitter = new StringTokenizer(input, "\t");
            context = splitter.nextToken();
            splitter.nextToken();
            focusWord = splitter.nextToken();
            simpleWord = splitter.nextToken();
            sense = splitter.nextToken();

            String[] wordAssociations = {context, sense, simpleWord};

            wordToSense.put(focusWord, wordAssociations);

            input = in.readLine();
          }

          // iterate through the input until the end or the max count is reached
          for (int k = 0; k < 1000000 && reader.hasNext(); k++) {
            ExamplePair p = reader.next();
            Alignment align = p.getAlignment();
            ArrayList<Word> normalWords = p.getNormal().getWords();
            ArrayList<Word> simpleWords = p.getSimple().getWords();

            // create the simple-words list object
            SimpleWordsList simpleWordsList = new SimpleWordsList();

            for (AlignPair pair : align) {
              int n = pair.getNormalIndex();
              int s = pair.getSimpleIndex();
              Word normal = normalWords.get(n);
              Word simple = simpleWords.get(s);
              boolean diffWords =
                  !normal.getWord().toLowerCase().equals(simple.getWord().toLowerCase());
              boolean normWordSimplePOS = pos.contains(normal.getPos());
              boolean posEqual = normal.getPos().equals(simple.getPos());
              boolean normalIsAlreadySimple = simpleWordsList.contains(normal.getWord());
              boolean doWeHaveSense = wordToSense.containsKey(normal.getWord());
              if (doWeHaveSense) context = wordToSense.get(normal.getWord())[0];
              boolean contextMatch = context.equals(p.getNormal().textString());

              if (diffWords
                  && normWordSimplePOS
                  && posEqual
                  && !normalIsAlreadySimple
                  && doWeHaveSense
                  && contextMatch) {
                String firstPart = "";
                String partialFirst = "";
                String wordAfterFocus = normalWords.get(n + 1).getWord();
                String target = normal.getWord();
                // append a space unless the next token is punctuation (a single char below 'A')
                if (!(wordAfterFocus.length() == 1 && wordAfterFocus.compareTo("A") < 0)) {
                  target += " ";
                }
                String secondPart = "";
                String partialSecond = "";
                sense = wordToSense.get(normal.getWord())[1];
                String POS = codeToPOS.get(normal.getPos());

                for (int i = 0; i < normalWords.size(); i++) {
                  String currentWord = normalWords.get(i).getWord();
                  String nextWord = "";
                  if (i + 1 < normalWords.size()) {
                    nextWord = normalWords.get(i + 1).getWord();
                  }
                  if (i < n) {
                    if (i > n - 3) partialFirst += currentWord;
                    firstPart += currentWord;
                    if (!(nextWord.length() == 1 && nextWord.compareTo("A") < 0)) {
                      firstPart += " ";
                      if (i > n - 3) partialFirst += " ";
                    }
                  }
                  if (i > n) {
                    if (i < n + 3) partialSecond += currentWord;
                    secondPart += currentWord;
                    if (!(nextWord.length() == 1 && nextWord.compareTo("A") < 0)) {
                      secondPart += " ";
                      if (i < n + 3) partialSecond += " ";
                    }
                  }
                }

                // comment out 2 out of the 3 for single grouping
                app.createContextGivenSurvey(firstPart, target, secondPart);
                app.createPartialContextGivenSurvey(
                    partialFirst, target, partialSecond, sense, POS);
                app.createNoContextGivenSurvey(target, sense, POS);
              }
            }
          }

          // comment out 2 for single grouping
          app.contextpr.close();
          app.partialContextpr.close();
          app.noContextpr.close();

        } else if (args[0].equals("-delete")) { // deletes the hits whose IDs are in the given file
          System.out.println("deleting");
          // IDs are usually stored in these files: NoContextGivenIDs, NoTargetGivenIDs,
          // ContextGivenIDs
          BufferedReader fileReader =
              new BufferedReader(new InputStreamReader(new FileInputStream(inputFile)));
          String hitId = "";

          for (hitId = fileReader.readLine(); hitId != null; hitId = fileReader.readLine()) {
            System.out.println(hitId);
            app.deleteHIT(hitId);
          }

        } else if (args[0].equals(
            "-approveAll")) { // approves all submissions for all hits whose IDs in the given file
          System.out.println("approving");
          // IDs are usually stored in these files: NoContextGivenIDs, NoTargetGivenIDs,
          // ContextGivenIDs
          BufferedReader fileReader =
              new BufferedReader(new InputStreamReader(new FileInputStream(inputFile)));
          String hitId = "";

          for (hitId = fileReader.readLine(); hitId != null; hitId = fileReader.readLine()) {
            System.out.println(hitId);
            app.approveHIT(hitId);
          }

        } else {
          System.err.println("No valid options were provided");
          System.out.println(usageError);
        }

      } catch (IOException e) {
        System.err.println("Could not find the file: \"" + args[1] + "\"");
        System.err.println("Please provide a valid file name");
      }

    } else System.out.println(usageError);
  }
Example #11
  public static void main(String[] args) throws Exception {
    String[] args2 = new String[3];
    args2[0] =
        "/Users/EsferaDePandora/Copy/2015/CICLO 2/CC2003 ALGORITMOS Y ESTRUCTURAS DE DATOS/Hojas de trabajo/HT09 - BST y MAPEO/HT9 proyecto/src/words.txt";
    args2[1] =
        "/Users/EsferaDePandora/Copy/2015/CICLO 2/CC2003 ALGORITMOS Y ESTRUCTURAS DE DATOS/Hojas de trabajo/HT09 - BST y MAPEO/HT9 proyecto/src/text.txt";
    args2[2] = "5";

    if (args2.length > 1) {
      File wordFile = new File(args2[0]);
      File textFile = new File(args2[1]);

      ///////////////////////////////////////////////////////////////////
      //  1 SimpleSet
      //  2 Red Black Tree
      //  3 Splay Tree
      //  4 Hash Table
      //  5 TreeMap (from the Java collections framework)
      ///////////////////////////////////////////////////////////////////

      int implementacion = Integer.parseInt(args2[2]);
      BufferedReader wordreader;
      BufferedReader textreader;
      int verbs = 0;
      int nouns = 0;
      int adjectives = 0;
      int adverbs = 0;
      int gerunds = 0;
      long starttime;
      long endtime;

      // verify that the input files exist
      if (wordFile.isFile() && textFile.isFile()) {
        // read the files
        try {
          wordreader = new BufferedReader(new FileReader(wordFile));
          textreader = new BufferedReader(new FileReader(textFile));
        } catch (Exception ex) {
          System.out.println("Error reading the files!");
          return;
        }

        ///////////////////////////////////////////////////////////////////
        WordSet words = WordSetFactory.generateSet(implementacion);
        ///////////////////////////////////////////////////////////////////

        String line = null;
        String[] wordParts;

        // read the word file
        starttime = System.currentTimeMillis();
        line = wordreader.readLine();
        while (line != null) {
          wordParts = line.split("\\."); // the quoted string is a regular expression
          if (wordParts.length == 2) {
            words.add(new Word(wordParts[0].trim(), wordParts[1].trim()));
          }
          line = wordreader.readLine();
        }
        wordreader.close();
        endtime = System.currentTimeMillis();

        System.out.println("Palabras cargadas en " + (endtime - starttime) + " ms.");

        starttime = System.currentTimeMillis();
        line = textreader.readLine();
        String[] textParts;
        Word currentword;
        Word lookupword = new Word();

        while (line != null) {
          // split the line into words
          textParts = line.split("[^\\w-]+");

          // check the type of each word
          for (int i = 0; i < textParts.length; i++) {
            lookupword.setWord(textParts[i].trim().toLowerCase());
            currentword = words.get(lookupword);
            if (currentword != null) {
              if (currentword.getType().equals("v-d")
                  || currentword.getType().equals("v")
                  || currentword.getType().equals("q")) verbs++;
              else if (currentword.getType().equals("g")) gerunds++;
              else if (currentword.getType().equals("a-s")
                  || currentword.getType().equals("a-c")
                  || currentword.getType().equals("a")) adjectives++;
              else if (currentword.getType().equals("e")) adverbs++;
              else nouns++;
            }
          }

          line = textreader.readLine();
        }
        textreader.close();
        endtime = System.currentTimeMillis();
        System.out.println("Texto analizado en " + (endtime - starttime) + " ms.");

        ///////////////////////////////////////////////////////////////////
        System.out.println("El texto tiene:");
        System.out.println(verbs + " verbos");
        System.out.println(nouns + " sustantivos");
        System.out.println(adjectives + " adjetivos");
        System.out.println(adverbs + " adverbios");
        System.out.println(gerunds + " gerundios");
        ///////////////////////////////////////////////////////////////////
      } else {
        System.out.println("No encuentro los archivos :'( ");
      }
    } else {
      System.out.println("Faltan Parametros.");
    }
  }
Example #12
	protected void out(Word w) {
		out(w.getLangId() + ": " + w.getSpelling() + " (" + w.getSign() + ")");
	}
Example #13
  /* public methods */
  public DirectedGraph parse() {
    /* add the parts of speech */
    addPOS();
    /* update all node to include parts of speech */
    System.out.println("Parsing file for parts of speech analysis...");

    byte[] buffer = new byte[100000]; // 100 kb
    int bytesRead = 0;
    try {
      /* retrieve the data from the file */
      FileInputStream fin = new FileInputStream(file);
      // nb: a single read may not fill the buffer for files larger than 100 kb
      bytesRead = fin.read(buffer);
      fin.close();
    } catch (Exception e) {
      System.out.println("IO Error: " + e);
    }

    /* transfer only the bytes actually read to a string */
    String data = new String(buffer, 0, Math.max(bytesRead, 0));

    /* create a tokenizer to parse the data */
    StringTokenizer st =
        new StringTokenizer(
            data, " :;\"\n\t\r_,.!?`\u2015\u2012\u2014\u2013\u2212"); // unicode for dash

    /* temporary variables */
    String pos = "";
    String wordString = "";
    Word word = new Word("");
    Node node = new Node(word);

    /* Penn Treebank tags in the order of their part-of-speech codes (1-37) */
    final String[] posTags = {
      "AFX", "CC", "CD", "DT", "EX", "FW", "IN", "JJ", "JJR", "JJS", "LS", "MD", "NN",
      "NNP", "NNPS", "NNS", "PDT", "POS", "PRP", "PRP$", "RB", "RBR", "RBS", "RP", "SYM",
      "TO", "UH", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ", "WDT", "WP", "WPS", "WRB"
    };
    while (st.hasMoreTokens()) {
      /* skip extraneous hyphens */
      String test = st.nextToken();
      while (test.equals("-") || test.equals("--")) {
        test = st.nextToken();
      }

      /* put the string to lowercase */
      wordString = test;
      wordString = wordString.toLowerCase();

      /* get the POS */
      if (st.hasMoreTokens()) {
        pos = st.nextToken();
      }
      /* if we have the possessive case */
      if (wordString.equals("'s")) {
        /* create a node object from the previous iteration */
        Word possessiveWord = new Word(word.toString() + "'s");
        node = new Node(possessiveWord);
        /* get the position of the node in the graph */
        int index = result.findIndex(node);
        /* add the possessive node to the graph */
        if (index >= 0) {
          /* make sure we get all the associations */
          node = result.nodeAt(index);
          /* transfer the part of speech */
          node.getWord().setPartOfSpeech(word.getPartOfSpeech());
          /* add the possessive quality */
          node.getWord().setPossessive();
          /* insert the node into our array at the right position */
          (result.getNodes())[index] = node;
        }

      } else {
        word = new Word(wordString);
        /* map the tag to its part-of-speech code (1-37) via the lookup table */
        int tagIndex = java.util.Arrays.asList(posTags).indexOf(pos);
        if (tagIndex >= 0) {
          word.setPartOfSpeech(tagIndex + 1);
        }

        node = new Node(word);

        int index = result.findIndex(node);
        if (index >= 0) {
          /* make sure we get all the associations */
          node = result.nodeAt(index);
          /* transfer the part of speech */
          node.getWord().setPartOfSpeech(word.getPartOfSpeech());
          /* insert the node into our array at the right position */
          (result.getNodes())[index] = node;
        }
      }
    } // end while
    return result;
  }