public static void extractPhrase() { System.out.println("name of forward alignment"); String fwA = UTIL_UserInput.fileNameInput(); HashMap<Bitext, AlignPair> currentMap = (HashMap<Bitext, AlignPair>) UTIL_FileOperations.openObject(fwA); System.out.println("name of backward alignment"); String bwA = UTIL_UserInput.fileNameInput(); HashMap<Bitext, AlignPair> bwAlignment = (HashMap<Bitext, AlignPair>) UTIL_FileOperations.openObject(bwA); String mapName = "forward.aMap"; for (int i = 0; i < 2; i++) { if (i == 1) { currentMap = bwAlignment; mapName = "reverse.aMap"; } // Iterator<Entry<Bitext, AlignPair>> fIt = fwAlignment.entrySet() // .iterator(); HashMap<Bitext, HashMap<Integer, Integer>> alignmentMap = new HashMap<Bitext, HashMap<Integer, Integer>>(); Iterator<Entry<Bitext, AlignPair>> cait = currentMap.entrySet().iterator(); while (cait.hasNext()) { HashMap<Integer, Integer> sourceTargetPair = new HashMap<Integer, Integer>(); Map.Entry<Bitext, AlignPair> pair = cait.next(); AlignPair forwardAlign = pair.getValue(); Bitext currentBitext = pair.getKey(); AlignPair backwardAlign = bwAlignment.get(currentBitext); ArrayList<WordPair> forwardViterbi = forwardAlign.getAlignments(); Integer sourceIndex = 0; ArrayList<Integer> targetsUsed = new ArrayList<Integer>(); // for forward alignment for (String s : currentBitext.getSource()) { Integer targetIndex = 0; Integer closestTarget = currentBitext.getTarget().length; for (WordPair wp : forwardViterbi) { if (wp.e.equals(s)) { String target = wp.f; ArrayList<Integer> targetPositions = new ArrayList<Integer>(); for (String t : currentBitext.getTarget()) { if (t.equals(target)) { targetPositions.add(targetIndex); } targetIndex++; } Integer source = sourceIndex; if (targetPositions.size() > 1) { System.out.println("multiple target words"); Double sourcePercentage = (double) (sourceIndex / currentBitext.getSource().length); Double smallest = 100000000000000d; for (Integer tp : targetPositions) { Double targetPercentage = (double) (tp / currentBitext.getTarget().length); Double distance = Math.abs(sourcePercentage - targetPercentage); if (smallest > distance) { smallest = distance; closestTarget = tp; } } } else if (targetPositions.size() == 1) { closestTarget = targetPositions.get(0); } break; } } System.out.println(sourceIndex); System.out.println(closestTarget); if (closestTarget >= currentBitext.getTarget().length) { System.out.println("Something is wrong"); System.exit(1); } sourceTargetPair.put(sourceIndex, closestTarget); sourceIndex++; } System.out.println("For bitext id: " + currentBitext.hashCode()); Iterator<Entry<Integer, Integer>> ait = sourceTargetPair.entrySet().iterator(); while (ait.hasNext()) { Map.Entry<Integer, Integer> ip = ait.next(); System.out.println( ip.getKey() + " : " + ip.getValue() + "----" + currentBitext.getSource()[ip.getKey()] + " : " + currentBitext.getTarget()[ip.getValue()]); } alignmentMap.put(currentBitext, sourceTargetPair); } UTIL_FileOperations.store(alignmentMap, mapName); } }
/** * Main method drives all methods * * @param args * @throws IOException */ public static void main(String[] args) throws IOException { String usageError = "Please provide a valid option. Such as: " + "\n -add FILENAME *creates new HITs from the data provided in the given file(s)* " + "\n -delete FILENAME *deletes all of the HITs with IDs matching those given in the file(s)*" + "\n -approveAll FILENAME *approves all the assignments for all HITs with IDs in the given file(s)*"; if (args.length >= 1) { // Create an instance of this class. LexicalSubSurvey app = new LexicalSubSurvey(); File inputFile = null; try { if (args.length > 1) inputFile = new File(args[1]); if (args[0].equals("-add")) { // When -add tag is given in adds HITs to Mechanical turk depending on the URL in the // mturk.properties file String[] parts = { "NN", "NNS", "JJ", "JJR", "JJS", "RB", "RBR", "RBS", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ" }; ArrayList<String> pos = new ArrayList<String>(); for (int i = 0; i < parts.length; i++) { pos.add(parts[i]); } ExamplePairReader reader = new ExamplePairReader(PARSED, ALIGN); BufferedReader in = new BufferedReader( new InputStreamReader( new FileInputStream(inputFile))); // typical file name: "sub.simple.first100" DateFormat dateFormat = new SimpleDateFormat("yyyy-MM-ddHH:mm:ss"); Date date = new Date(); // The three different experiments leave one uncommented at a time to do single groupings app.contextpr = new PrintWriter( new FileOutputStream( new File(inputFile.getName() + "ContextGivenIDs" + dateFormat.format(date)))); app.partialContextpr = new PrintWriter( new FileOutputStream( new File( inputFile.getName() + "partialContextIDs" + dateFormat.format(date)))); app.noContextpr = new PrintWriter( new FileOutputStream( new File( inputFile.getName() + "NoContextGivenIDs" + dateFormat.format(date)))); Map<String, String> codeToPOS = new HashMap<String, String>(14); codeToPOS.put("NN", "Noun"); codeToPOS.put("NNS", "Noun"); codeToPOS.put("JJ", "Adjective"); codeToPOS.put("JJR", "Adjective"); codeToPOS.put("JJS", "Adjective"); codeToPOS.put("RB", "Adverb"); codeToPOS.put("RBR", "Adverb"); codeToPOS.put("RBS", "Adverb"); codeToPOS.put("VB", "Verb"); codeToPOS.put("VBD", "Verb"); codeToPOS.put("VBG", "Verb"); codeToPOS.put("VBN", "Verb"); codeToPOS.put("VBP", "Verb"); codeToPOS.put("VBZ", "Verb"); String input = in.readLine(); Map<String, String[]> wordToSense = new HashMap<String, String[]>(25); String focusWord = ""; String sense = ""; String context = ""; String simpleWord; while (input != null) { StringTokenizer splitter = new StringTokenizer(input, "\t"); context = splitter.nextToken(); splitter.nextToken(); focusWord = splitter.nextToken(); simpleWord = splitter.nextToken(); sense = splitter.nextToken(); String[] wordAssociations = {context, sense, simpleWord}; wordToSense.put(focusWord, wordAssociations); input = in.readLine(); } for (int k = 0; k < 1000000 && reader.hasNext(); k++) { // for counted input goes through until reaches end or max number ExamplePair p = reader.next(); Alignment align = p.getAlignment(); ArrayList<Word> normalWords = p.getNormal().getWords(); ArrayList<Word> simpleWords = p.getSimple().getWords(); // creates object = list of simple words SimpleWordsList simpleWordsList = new SimpleWordsList(); for (AlignPair pair : align) { int n = pair.getNormalIndex(); int s = pair.getSimpleIndex(); Word normal = normalWords.get(n); Word simple = simpleWords.get(s); boolean diffWords = !normal.getWord().toLowerCase().equals(simple.getWord().toLowerCase()); boolean normWordSimplePOS = pos.contains(normal.getPos()); boolean posEqual = normal.getPos().equals(simple.getPos()); boolean normalIsAlreadySimple = simpleWordsList.contains(normal.getWord()); boolean doWeHaveSense = wordToSense.containsKey(normal.getWord()); if (doWeHaveSense) context = wordToSense.get(normal.getWord())[0]; boolean contextMatch = context.equals(p.getNormal().textString()); if (diffWords && normWordSimplePOS && posEqual && !normalIsAlreadySimple && doWeHaveSense && contextMatch) { String firstPart = ""; String partialFirst = ""; String wordAfterFocus = normalWords.get(n + 1).getWord(); String target = normal.getWord(); if (!(wordAfterFocus.length() == 1 && wordAfterFocus.compareTo("A") < 0)) { target += " "; } String secondPart = ""; String partialSecond = ""; sense = wordToSense.get(normal.getWord())[1]; String POS = codeToPOS.get(normal.getPos()); for (int i = 0; i < normalWords.size(); i++) { String currentWord = normalWords.get(i).getWord(); String nextWord = ""; if (i + 1 < normalWords.size()) { nextWord = normalWords.get(i + 1).getWord(); } if (i < n) { if (i > n - 3) partialFirst += currentWord; firstPart += currentWord; if (!(nextWord.length() == 1 && nextWord.compareTo("A") < 0)) { firstPart += " "; if (i > n - 3) partialFirst += " "; } } if (i > n) { if (i < n + 3) partialSecond += currentWord; secondPart += currentWord; if (!(nextWord.length() == 1 && nextWord.compareTo("A") < 0)) { secondPart += " "; if (i < n + 3) partialSecond += " "; } } } // comment out 2 out of the 3 for single grouping app.createContextGivenSurvey(firstPart, target, secondPart); app.createPartialContextGivenSurvey( partialFirst, target, partialSecond, sense, POS); app.createNoContextGivenSurvey(target, sense, POS); } } } // comment out 2 for single grouping app.contextpr.close(); app.partialContextpr.close(); app.noContextpr.close(); } else if (args[0].equals("-delete")) { // deletes the hits whose IDs are in the given file System.out.println("deleting"); // IDs are usually stored in these files: NoContextGivenIDs, NoTargetGivenIDs, // ContextGivenIDs BufferedReader fileReader = new BufferedReader(new InputStreamReader(new FileInputStream(inputFile))); String hitId = ""; for (hitId = fileReader.readLine(); hitId != null; hitId = fileReader.readLine()) { System.out.println(hitId); app.deleteHIT(hitId); } } else if (args[0].equals( "-approveAll")) { // approves all submissions for all hits whose IDs in the given file System.out.println("approving"); // IDs are usually stored in these files: NoContextGivenIDs, NoTargetGivenIDs, // ContextGivenIDs BufferedReader fileReader = new BufferedReader(new InputStreamReader(new FileInputStream(inputFile))); String hitId = ""; for (hitId = fileReader.readLine(); hitId != null; hitId = fileReader.readLine()) { System.out.println(hitId); app.approveHIT(hitId); } } else { System.err.println("No valid options were provided"); System.out.println(usageError); } } catch (IOException e) { System.err.println("Could not find the file: \"" + args[1] + "\""); System.err.println("Please provide a valid file name"); } } else System.out.println(usageError); }