// Verifies that a skewed join behaves like a regular join when no
// skew-related properties are configured: both query plans over the same
// inputs must produce identical bags.
public void testSkewedJoinWithNoProperties() throws IOException {
  // Fresh server so properties from earlier tests do not leak in.
  pigServer = new PigServer(ExecType.MAPREDUCE, cluster.getProperties());
  pigServer.registerQuery("A = LOAD '" + INPUT_FILE1 + "' as (id, name, n);");
  pigServer.registerQuery("B = LOAD '" + INPUT_FILE2 + "' as (id, name);");
  try {
    DataBag dbfrj = BagFactory.getInstance().newDefaultBag();
    DataBag dbshj = BagFactory.getInstance().newDefaultBag();
    {
      // Skewed join on a compound key with explicit parallelism.
      pigServer.registerQuery(
          "C = join A by (id, name), B by (id, name) using \"skewed\" parallel 5;");
      Iterator<Tuple> iter = pigServer.openIterator("C");
      while (iter.hasNext()) {
        dbfrj.add(iter.next());
      }
    }
    {
      // Reference result: default join on the same key.
      pigServer.registerQuery("E = join A by(id, name), B by (id, name);");
      Iterator<Tuple> iter = pigServer.openIterator("E");
      while (iter.hasNext()) {
        dbshj.add(iter.next());
      }
    }
    // Both joins must produce data, and the bags must match.
    Assert.assertTrue(dbfrj.size() > 0 && dbshj.size() > 0);
    Assert.assertEquals(true, TestHelper.compareBags(dbfrj, dbshj));
  } catch (Exception e) {
    // NOTE(review): fail(e.getMessage()) discards the stack trace — consider
    // letting the exception propagate instead.
    fail(e.getMessage());
  }
}
@Override public DataBag exec(Tuple input) throws IOException { try { DataBag bag = DefaultBagFactory.getInstance().newDefaultBag(); if (input == null || input.size() == 0) { return bag; // an empty bag } if (this.fieldType == DataType.MAP) { Tuple t = DefaultTupleFactory.getInstance().newTuple(1); t.set(0, createMap(input)); bag.add(t); } else { bag.add(input); } return bag; } catch (Exception e) { throw new RuntimeException( "Error while computing size in " + this.getClass().getSimpleName()); } }
/**
 * Per-group sampling step: every item whose uniform random score falls below
 * q1 is accepted outright; items between q1 and q2 are kept as sorted,
 * score-tagged candidates for a later top-up pass. Emits
 * (count, acceptedBag, candidateBag); an empty tuple when the input bag is null.
 */
@Override
public Tuple exec(Tuple input) throws IOException {
  Tuple result = tupleFactory.newTuple();
  DataBag accepted = bagFactory.newDefaultBag();
  DataBag candidates = bagFactory.newSortedBag(new ScoredTupleComparator());
  DataBag items = (DataBag) input.get(0);
  if (items != null) {
    long itemCount = items.size();
    double acceptThreshold = getQ1(itemCount, _samplingProbability);
    double candidateThreshold = getQ2(itemCount, _samplingProbability);
    for (Tuple item : items) {
      double score = _rdg.nextUniform(0.0d, 1.0d);
      if (score < acceptThreshold) {
        accepted.add(item);
      } else if (score < candidateThreshold) {
        candidates.add(new ScoredTuple(score, item).getIntermediateTuple(tupleFactory));
      }
    }
    result.append(itemCount);
    result.append(accepted);
    result.append(candidates);
  }
  return result;
}
// pig 1048 public void testSkewedJoinOneValue() throws IOException { pigServer.registerQuery("A = LOAD '" + INPUT_FILE3 + "' as (id,name);"); pigServer.registerQuery("B = LOAD '" + INPUT_FILE3 + "' as (id,name);"); // Filter key with a single value pigServer.registerQuery("C = FILTER A by id == 400;"); pigServer.registerQuery("D = FILTER B by id == 400;"); DataBag dbfrj = BagFactory.getInstance().newDefaultBag(), dbrj = BagFactory.getInstance().newDefaultBag(); { pigServer.registerQuery("E = join C by id, D by id using \"skewed\";"); Iterator<Tuple> iter = pigServer.openIterator("E"); while (iter.hasNext()) { dbfrj.add(iter.next()); } } { pigServer.registerQuery("E = join C by id, D by id;"); Iterator<Tuple> iter = pigServer.openIterator("E"); while (iter.hasNext()) { dbrj.add(iter.next()); } } Assert.assertEquals(dbfrj.size(), dbrj.size()); Assert.assertEquals(true, TestHelper.compareBags(dbfrj, dbrj)); }
@Override public Tuple exec(Tuple input) throws IOException { DataBag output = BagFactory.getInstance().newDefaultBag(); DataBag samples = (DataBag) input.get(0); if (samples == null) { // do nothing } else if (samples.size() <= numSamples) { // no need to construct a reservoir, so just emit intermediate tuples for (Tuple sample : samples) { // add the score on to the intermediate tuple output.add(new ScoredTuple(Math.random(), sample).getIntermediateTuple(tupleFactory)); } } else { for (Tuple sample : samples) { getReservoir().consider(new ScoredTuple(Math.random(), sample)); } for (ScoredTuple scoredTuple : getReservoir()) { // add the score on to the intermediate tuple output.add(scoredTuple.getIntermediateTuple(tupleFactory)); } } return tupleFactory.newTuple(output); }
// Skewed join keyed on the implicit "group" field of two GROUPed relations
// must match the regular join over the same grouped data.
public void testSkewedJoinWithGroup() throws IOException {
  pigServer.registerQuery("A = LOAD '" + INPUT_FILE1 + "' as (id, name, n);");
  pigServer.registerQuery("B = LOAD '" + INPUT_FILE2 + "' as (id, name);");
  pigServer.registerQuery("C = GROUP A by id;");
  pigServer.registerQuery("D = GROUP B by id;");
  DataBag dbfrj = BagFactory.getInstance().newDefaultBag(),
      dbshj = BagFactory.getInstance().newDefaultBag();
  {
    // Skewed join with explicit parallelism.
    pigServer.registerQuery("E = join C by group, D by group using \"skewed\" parallel 5;");
    Iterator<Tuple> iter = pigServer.openIterator("E");
    while (iter.hasNext()) {
      dbfrj.add(iter.next());
    }
  }
  {
    // Reference: regular join.
    pigServer.registerQuery("E = join C by group, D by group;");
    Iterator<Tuple> iter = pigServer.openIterator("E");
    while (iter.hasNext()) {
      dbshj.add(iter.next());
    }
  }
  Assert.assertTrue(dbfrj.size() > 0 && dbshj.size() > 0);
  Assert.assertEquals(true, TestHelper.compareBags(dbfrj, dbshj));
}
// Forces a tiny per-reducer tuple cap so skewed keys spill across many
// reducers, then checks the result still matches a regular join.
public void testSkewedJoinManyReducers() throws IOException {
  pigServer.getPigContext().getProperties().setProperty("pig.skewedjoin.reduce.maxtuple", "2");
  pigServer.registerQuery("A = LOAD '" + INPUT_FILE6 + "' as (id,name);");
  pigServer.registerQuery("B = LOAD '" + INPUT_FILE7 + "' as (id,name);");
  DataBag dbfrj = BagFactory.getInstance().newDefaultBag(),
      dbrj = BagFactory.getInstance().newDefaultBag();
  {
    // Very high parallelism combined with the low maxtuple cap above.
    pigServer.registerQuery("E = join A by id, B by id using \"skewed\" parallel 300;");
    Iterator<Tuple> iter = pigServer.openIterator("E");
    while (iter.hasNext()) {
      dbfrj.add(iter.next());
    }
  }
  {
    // Reference: regular join.
    pigServer.registerQuery("E = join A by id, B by id;");
    Iterator<Tuple> iter = pigServer.openIterator("E");
    while (iter.hasNext()) {
      dbrj.add(iter.next());
    }
  }
  Assert.assertEquals(dbfrj.size(), dbrj.size());
  Assert.assertEquals(true, TestHelper.compareBags(dbfrj, dbrj));
}
public DataBag exec(Tuple input) throws IOException { try { if (!input.isNull()) { // Create the output like a databag {(res1,res2),(res3,res4)..} DataBag output_databag = mBagFactory.newDefaultBag(); // Unpack tuple in order to get the bag {(1,2),(3,4),...} String input_time = (String) input.get(0); try { DateFormat formatter = new SimpleDateFormat("MM/dd/yyyy kk:mm:ss"); Date date = formatter.parse( String.format( "%s/%s/%s %s:%s:%s", input_time.substring(5, 7), input_time.substring(8, 10), input_time.substring(0, 4), input_time.substring(11, 13), input_time.substring(14, 16), input_time.substring(17, 18))); Calendar calendar = Calendar.getInstance(); calendar.setTime(date); int dayOfWeek = calendar.get(Calendar.DAY_OF_WEEK); int dayOfMonth = calendar.get(Calendar.DAY_OF_MONTH); int hour = calendar.get(Calendar.HOUR_OF_DAY); // Add items to output Tuple items = TupleFactory.getInstance().newTuple(1); items.set(0, String.format("%d:%d:%d", dayOfWeek, dayOfMonth, hour)); output_databag.add(items); } catch (Exception e) { Tuple items = TupleFactory.getInstance().newTuple(1); items.set(0, "petting #1" + e.getMessage()); output_databag.add(items); return output_databag; } return output_databag; } else { DataBag output_databag = mBagFactory.newDefaultBag(); Tuple items = TupleFactory.getInstance().newTuple(1); items.set(0, "petting #2"); output_databag.add(items); return output_databag; } } catch (Exception e) { System.err.println("Error with ?? .."); DataBag output_databag = mBagFactory.newDefaultBag(); Tuple items = TupleFactory.getInstance().newTuple(1); items.set(0, "petting #3" + e.getMessage()); output_databag.add(items); return output_databag; } }
/**
 * Reads one bag literal of the form {(...),(...),...} from the stream. The bag
 * schema must contain exactly one field, a tuple schema, which is used to
 * parse each element.
 *
 * @param in stream positioned at or before the opening '{'
 * @param fieldSchema schema of the bag field; must be non-null with an inner schema
 * @return the parsed bag
 * @throws IOException on a missing/invalid schema or a truncated stream
 */
private DataBag consumeBag(PushbackInputStream in, ResourceFieldSchema fieldSchema)
    throws IOException {
  if (fieldSchema == null) {
    throw new IOException("Schema is null");
  }
  // BUGFIX: a field schema with no inner schema previously caused a raw
  // NullPointerException on the getFields() dereference below.
  if (fieldSchema.getSchema() == null) {
    throw new IOException("Schema is null");
  }
  ResourceFieldSchema[] fss = fieldSchema.getSchema().getFields();
  Tuple t;
  int buf;
  // Skip ahead to the opening brace of the bag.
  while ((buf = in.read()) != '{') {
    if (buf == -1) {
      throw new IOException("Unexpect end of bag");
    }
  }
  if (fss.length != 1) throw new IOException("Only tuple is allowed inside bag schema");
  ResourceFieldSchema fs = fss[0];
  DataBag db = DefaultBagFactory.getInstance().newDefaultBag();
  // Consume tuples until the closing brace; commas separate elements.
  while (true) {
    t = consumeTuple(in, fs);
    if (t != null) db.add(t);
    while ((buf = in.read()) != '}' && buf != ',') {
      if (buf == -1) {
        throw new IOException("Unexpect end of bag");
      }
    }
    if (buf == '}') break;
  }
  return db;
}
/**
 * Aggregates per-class weights from the input bag and emits one
 * (class, weight / totalWeight) tuple per distinct class. Field 3 of each
 * inner tuple is the class label (ignored when null/empty); field 2 is the
 * weight increment.
 */
@Override
public DataBag exec(Tuple input) throws IOException {
  DataBag bag = (DataBag) input.get(0);
  HashMap<String, Double> clsCnt = new HashMap<String, Double>();
  // Accumulate with a primitive to avoid per-iteration boxing and the
  // deprecated `new Double(0.0)` constructor.
  double sum = 0.0;
  for (Tuple item : bag) {
    String cls = (String) item.get(3);
    if (cls != null && cls.length() > 0) {
      Double inc = (Double) item.get(2);
      Double cur = clsCnt.get(cls);
      clsCnt.put(cls, cur != null ? cur + inc : inc);
      sum += inc;
    }
  }
  // Normalize each class total by the overall sum.
  DataBag result = bagFactory.newDefaultBag();
  for (Entry<String, Double> cls : clsCnt.entrySet()) {
    Tuple tpl = tupleFactory.newTuple(2);
    tpl.set(0, cls.getKey());
    tpl.set(1, cls.getValue() / sum);
    result.add(tpl);
  }
  return result;
}
public DataBag exec(Tuple input) throws IOException { if (input == null || input.size() == 0) { return null; } String normStr = ((String) input.get(0)); if (normStr == null) { return null; } // Remove punctuation except when it's a version number normStr = punctPattern.matcher(normStr.trim().toLowerCase()).replaceAll(" "); normStr = spacePattern.matcher(normStr).replaceAll(" "); DataBag output = bagFactory.newDefaultBag(); for (String s : spacePattern.split(normStr.trim())) { if (s.length() <= 30) { Tuple t = tupleFactory.newTuple(1); t.set(0, s); output.add(t); } } return output; }
/**
 * Final sampling step: merges the per-group partial results (count, accepted
 * bag, sorted candidate bag), then tops up the accepted set from the
 * best-scored candidates until the target sample size is reached.
 */
@Override
public DataBag exec(Tuple input) throws IOException {
  DataBag partials = (DataBag) input.get(0);
  long total = 0L;
  DataBag chosen = bagFactory.newDefaultBag();
  DataBag candidates = bagFactory.newSortedBag(new ScoredTupleComparator());
  for (Tuple partial : partials) {
    total += (Long) partial.get(0);
    chosen.addAll((DataBag) partial.get(1));
    candidates.addAll((DataBag) partial.get(2));
  }
  // Target size is probability * total count, rounded up.
  long target = (long) Math.ceil(_samplingProbability * total);
  long deficit = target - chosen.size();
  for (Tuple scored : candidates) {
    if (deficit <= 0) {
      break;
    }
    chosen.add(ScoredTuple.fromIntermediateTuple(scored).getTuple());
    deficit--;
  }
  return chosen;
}
@Test
public void testCase1() throws IOException {
  // Input is a pair: (group-info tuple, bag of rows).
  Tuple input = TupleFactory.getInstance().newTuple(2);
  Tuple groupInfo = TupleFactory.getInstance().newTuple(2);
  groupInfo.set(0, "column_3");
  groupInfo.set(1, Integer.valueOf(1));
  DataBag dataBag = new DefaultDataBag();
  // {(PSIColumn: int, columnId: int, value: chararray, tag: boolean , rand: int)}
  // NOTE(review): the schema comment above lists five fields, but the tuples
  // built below have arity 4 with only slots 0-2 populated (slot 3 stays
  // null) — confirm this matches what inst.exec expects.
  for (int i = 0; i < 10; i++) {
    Tuple t = TupleFactory.getInstance().newTuple(4);
    t.set(0, Integer.valueOf(1));
    t.set(1, Integer.valueOf(1));
    t.set(2, array[i]);
    dataBag.add(t);
  }
  input.set(0, groupInfo);
  input.set(1, dataBag);
  Tuple output = inst.exec(input);
  Assert.assertEquals(output.get(0), 1);
  // Second output field is a separator-joined stats string.
  String[] outputArray =
      output.get(1).toString().split(String.valueOf(CalculateStatsUDF.CATEGORY_VAL_SEPARATOR));
  Assert.assertEquals(outputArray[0], "1");
  Assert.assertEquals(outputArray[1], "2");
  Assert.assertEquals(outputArray[2], "0");
}
/**
 * Extracts normalized words from a document's titles, optionally restricted
 * to one language. Each word surviving diacritics removal, the \W+ split, and
 * normalizeExtracted becomes a one-field tuple in the returned bag.
 */
@Override
public DataBag extract(Object o, String lang) {
  DocumentMetadata metadata = (DocumentMetadata) o;
  DataBag result = new DefaultDataBag();
  DiacriticsRemover remover = new DiacriticsRemover();
  for (TextWithLanguage title : metadata.getBasicMetadata().getTitleList()) {
    // Language filter requested and this title does not match — skip it.
    if (lang != null && !lang.equalsIgnoreCase(title.getLanguage())) {
      continue;
    }
    String normalizedTitle = (String) remover.normalize(title.getText());
    if (normalizedTitle == null) {
      continue;
    }
    for (String word : normalizedTitle.split("[\\W]+")) {
      if (word.isEmpty()) {
        continue;
      }
      Object normalizedWord = normalizeExtracted(word);
      if (normalizedWord != null) {
        result.add(TupleFactory.getInstance().newTuple(normalizedWord));
      }
    }
  }
  return result;
}
/**
 * Returns the current reservoir contents as a bag of the original tuples
 * (scores stripped).
 */
@Override
public DataBag getValue() {
  DataBag result = BagFactory.getInstance().newDefaultBag();
  for (ScoredTuple entry : getReservoir()) {
    result.add(entry.getTuple());
  }
  return result;
}
/**
 * Creates a bag holding {@code size} entries. Note that the SAME single-field
 * tuple instance (containing the integer 1) is added every time, matching the
 * original helper's reference-sharing behavior.
 *
 * @param size number of entries to add
 * @return the populated bag
 */
private DataBag createBag(int size) {
  Tuple shared = TupleFactory.getInstance().newTuple();
  shared.append(Integer.valueOf(1));
  DataBag bag = BagFactory.getInstance().newDefaultBag();
  for (int added = 0; added < size; added++) {
    bag.add(shared);
  }
  return bag;
}
@SuppressWarnings("unchecked")
@Override
public DataBag exec(Tuple input) throws IOException {
  // Set-difference over bags: emits tuples present in the first bag but in
  // none of the others. A priority queue of per-bag cursors sweeps all bags
  // in one merge pass — this appears to assume the bags iterate in sorted
  // order (NOTE: confirm against loadBags/countMatches).
  if (input.size() < 2) {
    throw new RuntimeException("Expected at least two inputs, but found " + input.size());
  }
  for (Object o : input) {
    if (o != null && !(o instanceof DataBag)) {
      throw new RuntimeException("Inputs must be bags");
    }
  }
  DataBag outputBag = bagFactory.newDefaultBag();
  DataBag bag1 = (DataBag) input.get(0);
  DataBag bag2 = (DataBag) input.get(1);
  if (bag1 == null || bag1.size() == 0) {
    // Nothing to subtract from.
    return outputBag;
  }
  // optimization
  else if (input.size() == 2 && (bag2 == null || bag2.size() == 0)) {
    // Subtracting an empty bag leaves the first bag unchanged.
    return bag1;
  }
  PriorityQueue<Pair> pq = loadBags(input);
  Tuple lastData = null;
  while (true) {
    Pair nextPair = pq.peek();
    // ignore data we've already encountered
    if (nextPair.data.compareTo(lastData) != 0) {
      // Only take data from the first bag, where there are no other
      // bags that have the same data.
      if (nextPair.index.equals(0) && countMatches(pq) == 0) {
        outputBag.add(nextPair.data);
        lastData = nextPair.data;
      }
    }
    Pair p = pq.poll();
    // only put the bag back into the queue if it still has data
    if (p.hasNext()) {
      p.next();
      pq.offer(p);
    } else if (p.index.equals(0)) {
      // stop when we exhaust all elements from the first bag
      break;
    }
  }
  return outputBag;
}
@Override
public Tuple exec(Tuple input) throws IOException {
  // Builds (key, bag of classification codes, bag size) for documents whose
  // code count exceeds the given limit; returns null otherwise.
  if (input == null || input.size() == 0) {
    return null;
  }
  Object obj = null;
  Integer limnum = null;
  try {
    // Field 1: serialized DocumentMetadata protobuf bytes.
    obj = (DataByteArray) input.get(1);
  } catch (ExecException e) {
    logger.error("Error in reading field proto:", e);
    throw e;
  }
  try {
    // Field 2: bag-size threshold.
    // NOTE(review): a null limit would NPE at the `bagsize > limnum`
    // unboxing below — confirm callers always pass a value.
    limnum = (Integer) input.get(2);
  } catch (ExecException e) {
    logger.error("Error in reading baglimit:", e);
    throw e;
  }
  DataByteArray dba = null;
  try {
    dba = (DataByteArray) obj;
  } catch (ClassCastException e) {
    logger.error("Error in casting Object (" + input.getType(1) + ") to DataByteArray:", e);
    throw e;
  }
  DocumentMetadata dm = null;
  try {
    dm = DocumentMetadata.parseFrom(dba.get());
  } catch (InvalidProtocolBufferException e) {
    logger.error("Error in reading ByteArray to DocumentMetadata:", e);
    throw e;
  }
  String key = dm.getKey();
  DataBag db = new DefaultDataBag();
  int bagsize = 0;
  // One single-field tuple per classification-code value.
  for (ClassifCode code : dm.getBasicMetadata().getClassifCodeList()) {
    for (String co_str : code.getValueList()) {
      bagsize++;
      db.add(TupleFactory.getInstance().newTuple(co_str));
    }
  }
  if (bagsize > limnum) {
    Object[] to = new Object[] {key, db, bagsize};
    return TupleFactory.getInstance().newTuple(Arrays.asList(to));
  }
  return null;
}
/**
 * Wires up this operator to its single upstream block and pre-builds the
 * one-entry bag of an empty tuple kept in {@code nullBag} (presumably used
 * for null padding downstream — confirm against callers).
 */
@Override
public void setInput(Map<String, Block> input, JsonNode operatorJson, BlockProperties props)
    throws IOException, InterruptedException {
  inputBlock = input.values().iterator().next();
  init(operatorJson, inputBlock.getProperties().getSchema());
  Tuple emptyTuple = TupleFactory.getInstance().newTuple(0);
  nullBag = BagFactory.getInstance().newDefaultBag();
  nullBag.add(emptyTuple);
}
@Override
public DataBag exec(Tuple input) throws IOException {
  // Performs an n-way outer join of the configured bags. The input tuple
  // interleaves bags and join-key names; odd positions hold the key names.
  retrieveContextValues();
  ArrayList<String> joinKeyNames = new ArrayList<String>();
  for (int i = 1; i < input.size(); i += 2) {
    joinKeyNames.add((String) input.get(i));
  }
  JoinCollector collector = new JoinCollector();
  // the first bag is the outer bag
  String leftBagName = bagNames.get(0);
  DataBag leftBag = getBag(input, leftBagName);
  String leftBagJoinKeyName =
      getPrefixedAliasName(bagNameToJoinKeyPrefix.get(leftBagName), joinKeyNames.get(0));
  // Seed the collector with the outer bag grouped by its join key.
  collector.setJoinData(collector.groupTuples(leftBag, leftBagJoinKeyName));
  // now, for each additional bag, group up the tuples by the join key, then join them in
  if (bagNames.size() > 1) {
    for (int i = 1; i < bagNames.size(); i++) {
      String bagName = bagNames.get(i);
      DataBag bag = getBag(input, bagName);
      String joinKeyName =
          getPrefixedAliasName(bagNameToJoinKeyPrefix.get(bagName), joinKeyNames.get(i));
      int tupleSize = bagNameToSize.get(bagName);
      if (bag == null)
        throw new IOException(
            "Error in instance: "
                + getInstanceName()
                + " with properties: "
                + getInstanceProperties()
                + " and tuple: "
                + input.toDelimitedString(", ")
                + " -- Expected bag, got null");
      HashMap<Object, List<Tuple>> groupedData = collector.groupTuples(bag, joinKeyName);
      // outer join, so go back in and add nulls;
      groupedData = collector.insertNullTuples(groupedData, tupleSize);
      for (Map.Entry<Object, List<Tuple>> entry : groupedData.entrySet()) {
        collector.joinTuples(entry.getKey(), entry.getValue());
      }
    }
  }
  // assemble output bag
  DataBag outputBag = BagFactory.getInstance().newDefaultBag();
  for (List<Tuple> tuples : collector.getJoinData().values()) {
    for (Tuple tuple : tuples) {
      outputBag.add(tuple);
    }
  }
  return outputBag;
}
// Left, right, and full outer variants of the skewed join should all execute
// without error; only successful execution is asserted, not the contents.
public void testSkewedJoinOuter() throws IOException {
  pigServer.registerQuery("A = LOAD '" + INPUT_FILE5 + "' as (id,name);");
  pigServer.registerQuery("B = LOAD '" + INPUT_FILE5 + "' as (id,name);");
  try {
    DataBag dbfrj = BagFactory.getInstance().newDefaultBag();
    {
      pigServer.registerQuery("C = join A by id left, B by id using \"skewed\";");
      Iterator<Tuple> iter = pigServer.openIterator("C");
      while (iter.hasNext()) {
        dbfrj.add(iter.next());
      }
    }
    {
      pigServer.registerQuery("C = join A by id right, B by id using \"skewed\";");
      Iterator<Tuple> iter = pigServer.openIterator("C");
      while (iter.hasNext()) {
        dbfrj.add(iter.next());
      }
    }
    {
      pigServer.registerQuery("C = join A by id full, B by id using \"skewed\";");
      Iterator<Tuple> iter = pigServer.openIterator("C");
      while (iter.hasNext()) {
        dbfrj.add(iter.next());
      }
    }
  } catch (Exception e) {
    System.out.println(e.getMessage());
    e.printStackTrace();
    fail("Should support outer join in skewed join");
  }
  return;
}
/**
 * Flattens the input into a bag of one-field tuples: nested tuples contribute
 * one output tuple per inner field; every other value is wrapped directly.
 */
public DataBag exec(Tuple input) throws IOException {
  try {
    DataBag result = BagFactory.getInstance().newDefaultBag();
    for (int i = 0; i < input.size(); i++) {
      Object field = input.get(i);
      if (field instanceof Tuple) {
        Tuple nested = (Tuple) field;
        for (int j = 0; j < nested.size(); j++) {
          Tuple cell = TupleFactory.getInstance().newTuple(1);
          cell.set(0, nested.get(j));
          result.add(cell);
        }
      } else {
        Tuple cell = TupleFactory.getInstance().newTuple(1);
        cell.set(0, field);
        result.add(cell);
      }
    }
    return result;
  } catch (Exception ee) {
    throw new RuntimeException("Error while creating a bag", ee);
  }
}
// A skewed join restricted to a single reducer (parallel 1) must still run to
// completion; any exception fails the test.
public void testSkewedJoinReducers() throws IOException {
  pigServer.registerQuery("A = LOAD '" + INPUT_FILE1 + "' as (id, name, n);");
  pigServer.registerQuery("B = LOAD '" + INPUT_FILE2 + "' as (id, name);");
  try {
    DataBag dbfrj = BagFactory.getInstance().newDefaultBag();
    {
      pigServer.registerQuery("C = join A by id, B by id using \"skewed\" parallel 1;");
      Iterator<Tuple> iter = pigServer.openIterator("C");
      while (iter.hasNext()) {
        dbfrj.add(iter.next());
      }
    }
  } catch (Exception e) {
    fail("Should not throw exception, should continue execution");
  }
}
@Override
public DataBag exec(Tuple input) throws IOException {
  // Emits one (surname, serialized metadata, author index) tuple per author
  // of the DocumentWrapper serialized in field 0.
  if (input == null || input.size() == 0) {
    return null;
  }
  try {
    DataByteArray dba = null;
    try {
      dba = (DataByteArray) input.get(0);
    } catch (ExecException e) {
      logger.error("Error in reading field:", e);
      throw e;
    }
    DocumentWrapper dm = null;
    try {
      dm = DocumentWrapper.parseFrom(dba.get());
    } catch (Exception e) {
      logger.error("Error in reading ByteArray to DocumentMetadata:", e);
      throw e;
    }
    DataBag ret = new DefaultDataBag();
    // Serialize the metadata once and share the bytes across all author tuples.
    DataByteArray metadata = new DataByteArray(dm.getDocumentMetadata().toByteArray());
    List<Author> authors = dm.getDocumentMetadata().getBasicMetadata().getAuthorList();
    for (int i = 0; i < authors.size(); i++) {
      String sname = authors.get(i).getSurname();
      Object[] to = new Object[] {sname, metadata, i};
      Tuple t = TupleFactory.getInstance().newTuple(Arrays.asList(to));
      ret.add(t);
    }
    return ret;
  } catch (Exception e) {
    // Wrap anything unexpected with the full stack trace for the Pig log.
    logger.error("Error in processing input row:", e);
    throw new IOException(
        "Caught exception processing input row:\n" + StackTraceExtractor.getStackTrace(e));
  }
}
public DataBag exec(Tuple input) throws IOException { if (input == null || input.size() < 1 || input.isNull(0)) return null; // Output bag DataBag bagOfTokens = bagFactory.newDefaultBag(); StringReader textInput = new StringReader(input.get(0).toString()); PTBTokenizer ptbt = new PTBTokenizer(textInput, new CoreLabelTokenFactory(), ""); for (CoreLabel label; ptbt.hasNext(); ) { label = (CoreLabel) ptbt.next(); if (label.value().length() > 2) { System.err.println(label.toString()); Tuple termText = tupleFactory.newTuple(label.word()); bagOfTokens.add(termText); } } return bagOfTokens; }
@Override public DataBag exec(Tuple input) throws IOException { DataBag bagOfSamples = (DataBag) input.get(0); for (Tuple innerTuple : bagOfSamples) { DataBag samples = (DataBag) innerTuple.get(0); for (Tuple sample : samples) { // use the same score as previously generated getReservoir().consider(ScoredTuple.fromIntermediateTuple(sample)); } } DataBag output = BagFactory.getInstance().newDefaultBag(); for (ScoredTuple scoredTuple : getReservoir()) { // output the original tuple output.add(scoredTuple.getTuple()); } return output; }
// Negative test: a three-way skewed join is unsupported, so the query is
// expected to throw; reaching the end of the try block is a failure.
public void testSkewedJoin3Way() throws IOException {
  pigServer.registerQuery("A = LOAD '" + INPUT_FILE1 + "' as (id, name, n);");
  pigServer.registerQuery("B = LOAD '" + INPUT_FILE2 + "' as (id, name);");
  pigServer.registerQuery("C = LOAD '" + INPUT_FILE3 + "' as (id, name);");
  try {
    DataBag dbfrj = BagFactory.getInstance().newDefaultBag();
    {
      pigServer.registerQuery("D = join A by id, B by id, C by id using \"skewed\" parallel 5;");
      Iterator<Tuple> iter = pigServer.openIterator("D");
      while (iter.hasNext()) {
        dbfrj.add(iter.next());
      }
    }
  } catch (Exception e) {
    // Expected path: the 3-way skewed join is rejected.
    return;
  }
  fail("Should throw exception, do not support 3 way join");
}
/**
 * Drains every input operator except the last into its own bag and opens an
 * iterator over each, so the cross product can be produced later. NULL
 * results are skipped; an ERR status aborts with an ExecException.
 */
@SuppressWarnings("unchecked")
private void accumulateData() throws ExecException {
  int length = inputs.size() - 1;
  inputBags = new DataBag[length];
  its = new Iterator[length];
  for (int i = 0; i < length; ++i) {
    DataBag bag = BagFactory.getInstance().newDefaultBag();
    inputBags[i] = bag;
    PhysicalOperator op = inputs.get(i);
    for (Result res = op.getNextTuple();
        res.returnStatus != POStatus.STATUS_EOP;
        res = op.getNextTuple()) {
      if (res.returnStatus == POStatus.STATUS_NULL) {
        continue;
      }
      if (res.returnStatus == POStatus.STATUS_ERR) {
        throw new ExecException("Error accumulating data in the local Cross operator");
      }
      if (res.returnStatus == POStatus.STATUS_OK) {
        bag.add((Tuple) res.result);
      }
    }
    its[i] = bag.iterator();
  }
}
// Skewed join keyed on a map lookup combined with a cast expression should
// execute; only successful execution is asserted here.
public void testSkewedJoinMapKey() throws IOException {
  pigServer.registerQuery("A = LOAD '" + INPUT_FILE4 + "' as (m:[]);");
  pigServer.registerQuery("B = LOAD '" + INPUT_FILE4 + "' as (n:[]);");
  try {
    DataBag dbfrj = BagFactory.getInstance().newDefaultBag();
    {
      pigServer.registerQuery(
          "C = join A by (chararray)m#'a100', B by (chararray)n#'a100' using \"skewed\" parallel 20;");
      Iterator<Tuple> iter = pigServer.openIterator("C");
      while (iter.hasNext()) {
        dbfrj.add(iter.next());
      }
    }
  } catch (Exception e) {
    System.out.println(e.getMessage());
    e.printStackTrace();
    fail("Should support maps and expression operators as keys");
  }
  return;
}
/**
 * Splits a document's keywords (restricted to the configured language) into
 * free-text keywords and classification codes. Returns the normalized
 * keywords joined by spaces, paired with a bag of one-field tuples holding
 * the distinct classification codes (including codes from the basic metadata).
 */
private Pair<String, DataBag> extractLangKeywords(DocumentMetadata dm) {
  List<String> keywords = new ArrayList<String>();
  Set<String> codes = new HashSet<String>();
  for (KeywordsList kwl : dm.getKeywordsList()) {
    // Only the configured language contributes keywords.
    if (!language.equalsIgnoreCase(kwl.getLanguage())) {
      continue;
    }
    for (String keyword : kwl.getKeywordsList()) {
      if (isClassifCode(keyword)) {
        codes.add(keyword);
        continue;
      }
      // Normalize the keyword according to the configured action.
      if (action == Action.TRANSLATE) {
        keyword = translateNonAlphaNumeric(keyword);
      } else if (action == Action.REMOVE_KEYCHARACTERS) {
        keyword = removeAllKeyPunctations(keyword);
      } else {
        keyword = removeAllNonAlphaNumeric(keyword);
      }
      keywords.add(keyword);
    }
  }
  // Merge in classification codes declared on the basic metadata.
  for (ClassifCode cc : dm.getBasicMetadata().getClassifCodeList()) {
    codes.addAll(cc.getValueList());
  }
  DataBag codeBag = new DefaultDataBag();
  for (String code : codes) {
    codeBag.add(TupleFactory.getInstance().newTuple(code));
  }
  return new Pair<String, DataBag>(Joiner.on(" ").join(keywords), codeBag);
}