/* (non-Javadoc) * @see org.apache.hadoop.mapreduce.Mapper#setup(org.apache.hadoop.mapreduce.Mapper.Context) */ protected void setup(Context context) throws IOException, InterruptedException { Configuration config = context.getConfiguration(); fieldDelim = config.get("field.delim", "[]"); fieldDelimRegex = config.get("field.delim.regex", "\\[\\]"); // country specific format String country = config.get("text.country", "United States"); countryFormat = CountryStandardFormat.createCountryStandardFormat(country, textNormalizer); // language specific analyzer String lang = config.get("text.language", "en"); createAnalyzer(lang); // load schema String filePath = config.get("raw.schema.file.path"); FileSystem dfs = FileSystem.get(config); Path src = new Path(filePath); FSDataInputStream fs = dfs.open(src); ObjectMapper mapper = new ObjectMapper(); schema = mapper.readValue(fs, SingleTypeSchema.class); }
/* (non-Javadoc) * @see org.apache.hadoop.mapreduce.Mapper#map(KEYIN, VALUEIN, org.apache.hadoop.mapreduce.Mapper.Context) */ @Override protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { String[] items = value.toString().split(fieldDelimRegex); itemList.clear(); for (int i = 0; i < items.length; ++i) { String item = items[i]; Field field = schema.getEntity().getFieldByOrdinal(i); if (null != field && field.getDataType().equals(Field.DATA_TYPE_TEXT)) { String format = field.getTextDataSubTypeFormat(); if (field.getDataSubType().equals(Field.TEXT_TYPE_PERSON_NAME)) { item = countryFormat.personNameFormat(item); } else if (field.getDataSubType().equals(Field.TEXT_TYPE_STREET_ADDRESS)) { item = countryFormat.caseFormat(item, format); item = countryFormat.streetAddressFormat(item); } else if (field.getDataSubType().equals(Field.TEXT_TYPE_STREET_ADDRESS_ONE)) { item = countryFormat.caseFormat(item, format); item = countryFormat.streetAddressOneFormat(item); } else if (field.getDataSubType().equals(Field.TEXT_TYPE_STREET_ADDRESS_TWO)) { item = countryFormat.caseFormat(item, format); item = countryFormat.streetAddressTwoFormat(item); } else if (field.getDataSubType().equals(Field.TEXT_TYPE_CITY)) { item = countryFormat.caseFormat(item, format); } else if (field.getDataSubType().equals(Field.TEXT_TYPE_STATE)) { item = countryFormat.stateFormat(item); } else if (field.getDataSubType().equals(Field.TEXT_TYPE_ZIP)) { item = countryFormat.caseFormat(item, format); } else if (field.getDataSubType().equals(Field.TEXT_TYPE_COUNTRY)) { item = countryFormat.caseFormat(item, format); } else if (field.getDataSubType().equals(Field.TEXT_TYPE_EMAIL_ADDR)) { item = countryFormat.emailFormat(item, format); } else if (field.getDataSubType().equals(Field.TEXT_TYPE_PHONE_NUM)) { item = countryFormat.phoneNumFormat(item, format); } else { // if text field analyze item = tokenize(item); } } itemList.add(item); } // build value string valueHolder.set(org.chombo.util.Utility.join(itemList, fieldDelim)); context.write(NullWritable.get(), valueHolder); }