/**
 * Runs all field extractors registered for a source field and caches the result,
 * keyed by the extractor's target ordinal. An empty string is stored when no match is found.
 * @param ordinal ordinal of the source field in the record
 * @param data    raw field value to run the extractors against
 */
private void findExtractedFields(int ordinal, String data) {
    List<FieldExtractor> extractors = schema.getEntity().getExtractorsForField(ordinal);
    for (FieldExtractor extractor : extractors) {
        String extField = extrtactedFields.get(extractor.getOrdinal());
        if (null == extField || extField.isEmpty()) {
            // nothing extracted yet for this target ordinal
            String match = extractor.findMatch(data);
            if (null == match) {
                match = "";
            }
            extrtactedFields.put(extractor.getOrdinal(), match);
        }
    }
}
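// Illustrative sketch only (the extractor configuration below is hypothetical, not taken from
// this class): assuming the schema attaches a pattern-based FieldExtractor to the street
// address ordinal that pulls out an apartment number, a call like
//
//     findExtractedFields(addressOrdinal, "34 Elm St Apt 7");
//
// would leave "Apt 7" in extrtactedFields under the extractor's target ordinal, and "" when
// the pattern finds no match in the data.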
/* (non-Javadoc)
 * @see org.apache.hadoop.mapreduce.Mapper#map(KEYIN, VALUEIN, org.apache.hadoop.mapreduce.Mapper.Context)
 */
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
    String[] items = value.toString().split(fieldDelimRegex);
    itemList.clear();
    for (int i = 0; i < items.length; ++i) {
        String item = items[i];
        Field field = schema.getEntity().getFieldByOrdinal(i);
        if (null != field && field.getDataType().equals(Field.DATA_TYPE_TEXT)) {
            // format text fields according to their sub type
            String format = field.getTextDataSubTypeFormat();
            if (field.getDataSubType().equals(Field.TEXT_TYPE_PERSON_NAME)) {
                item = countryFormat.personNameFormat(item);
            } else if (field.getDataSubType().equals(Field.TEXT_TYPE_STREET_ADDRESS)) {
                item = countryFormat.caseFormat(item, format);
                item = countryFormat.streetAddressFormat(item);
            } else if (field.getDataSubType().equals(Field.TEXT_TYPE_STREET_ADDRESS_ONE)) {
                item = countryFormat.caseFormat(item, format);
                item = countryFormat.streetAddressOneFormat(item);
            } else if (field.getDataSubType().equals(Field.TEXT_TYPE_STREET_ADDRESS_TWO)) {
                item = countryFormat.caseFormat(item, format);
                item = countryFormat.streetAddressTwoFormat(item);
            } else if (field.getDataSubType().equals(Field.TEXT_TYPE_CITY)) {
                item = countryFormat.caseFormat(item, format);
            } else if (field.getDataSubType().equals(Field.TEXT_TYPE_STATE)) {
                item = countryFormat.stateFormat(item);
            } else if (field.getDataSubType().equals(Field.TEXT_TYPE_ZIP)) {
                item = countryFormat.caseFormat(item, format);
            } else if (field.getDataSubType().equals(Field.TEXT_TYPE_COUNTRY)) {
                item = countryFormat.caseFormat(item, format);
            } else if (field.getDataSubType().equals(Field.TEXT_TYPE_EMAIL_ADDR)) {
                item = countryFormat.emailFormat(item, format);
            } else if (field.getDataSubType().equals(Field.TEXT_TYPE_PHONE_NUM)) {
                item = countryFormat.phoneNumFormat(item, format);
            } else {
                // any other text field gets analyzed
                item = tokenize(item);
            }
        }
        itemList.add(item);
    }

    // build output record
    valueHolder.set(org.chombo.util.Utility.join(itemList, fieldDelim));
    context.write(NullWritable.get(), valueHolder);
}
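// Illustrative example only (the record below is made up, assuming "[]" as the configured
// delimiter and person name / street address / city sub types on the first three fields):
//
//     in  : "jOHN dOE[]12 oak STREET[]new york"
//     out : "John Doe[]12 Oak Street[]New York"
//
// The exact casing and abbreviation handling depends on the countryFormat implementation.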
/* (non-Javadoc)
 * @see org.apache.hadoop.mapreduce.Mapper#setup(org.apache.hadoop.mapreduce.Mapper.Context)
 */
@Override
protected void setup(Context context) throws IOException, InterruptedException {
    fieldDelim = context.getConfiguration().get("field.delim", "[]");
    fieldDelimRegex = context.getConfiguration().get("field.delim.regex", "\\[\\]");
    consolidateFields = context.getConfiguration().getBoolean("consolidate.field", false);

    // ordinals of fields to be treated as free form text
    String textFields = context.getConfiguration().get("text.field.ordinals", "");
    if (!textFields.isEmpty()) {
        String[] items = textFields.split(",");
        for (int i = 0; i < items.length; ++i) {
            textFieldOrdinals.add(Integer.parseInt(items[i]));
        }
    }
    analyzer = new StandardAnalyzer(Version.LUCENE_35);

    // load raw schema from HDFS
    Configuration conf = context.getConfiguration();
    String filePath = conf.get("raw.schema.file.path");
    FileSystem dfs = FileSystem.get(conf);
    Path src = new Path(filePath);
    FSDataInputStream fs = dfs.open(src);
    ObjectMapper mapper = new ObjectMapper();
    schema = mapper.readValue(fs, SingleTypeSchema.class);
    for (Field field : schema.getEntity().getFields()) {
        retainedFieldOrdinals.add(field.getOrdinal());
    }
}
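// Minimal driver sketch (the class names and paths below are hypothetical; only the
// configuration keys match what setup() reads above):
//
//     Configuration conf = new Configuration();
//     conf.set("field.delim", "[]");
//     conf.set("field.delim.regex", "\\[\\]");
//     conf.set("text.field.ordinals", "5,6");
//     conf.set("raw.schema.file.path", "/path/to/raw/schema.json");
//     Job job = new Job(conf, "data formatter");
//     job.setJarByClass(DataFormatter.class);                   // hypothetical driver class
//     job.setMapperClass(DataFormatter.FormatterMapper.class);  // this mapper, name assumed
//     job.setNumReduceTasks(0);                                 // map only job
//     FileInputFormat.addInputPath(job, new Path(args[0]));
//     FileOutputFormat.setOutputPath(job, new Path(args[1]));
//     job.waitForCompletion(true);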