Example #1
 /**
  * Runs every extractor configured for the given source field and caches
  * the first non-empty match, keyed by extractor ordinal, in extractedFields.
  *
  * @param ordinal ordinal of the source field in the input record
  * @param data    raw value of the source field
  */
 private void findExtractedFields(int ordinal, String data) {
   List<FieldExtractor> extractors = schema.getEntity().getExtractorsForField(ordinal);
   for (FieldExtractor extractor : extractors) {
      String extField = extractedFields.get(extractor.getOrdinal());
     if (null == extField || extField.isEmpty()) {
       String match = extractor.findMatch(data);
       if (null == match) {
         match = "";
       }
        extractedFields.put(extractor.getOrdinal(), match);
     }
   }
 }
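
The FieldExtractor type used above is not part of the snippet. A minimal sketch of the contract the loop relies on, assuming a regex-backed extractor (hypothetical; the real class in this codebase may look different):

import java.util.regex.Matcher;
import java.util.regex.Pattern;

// Hypothetical regex-backed extractor; it illustrates only the two calls
// findExtractedFields() makes: getOrdinal() and findMatch().
public class RegexFieldExtractor {
  private final int ordinal;       // target ordinal in the extracted-field map
  private final Pattern pattern;   // pattern applied to the raw field value

  public RegexFieldExtractor(int ordinal, String regex) {
    this.ordinal = ordinal;
    this.pattern = Pattern.compile(regex);
  }

  public int getOrdinal() {
    return ordinal;
  }

  // first match in the data, or null when nothing matches
  public String findMatch(String data) {
    Matcher matcher = pattern.matcher(data);
    return matcher.find() ? matcher.group() : null;
  }
}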
Example #2

    /* (non-Javadoc)
     * @see org.apache.hadoop.mapreduce.Mapper#map(KEYIN, VALUEIN, org.apache.hadoop.mapreduce.Mapper.Context)
     */
    @Override
    protected void map(LongWritable key, Text value, Context context)
        throws IOException, InterruptedException {
      String[] items = value.toString().split(fieldDelimRegex);
      itemList.clear();

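       // apply locale-specific formatting to text fields based on their schema subtype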
      for (int i = 0; i < items.length; ++i) {
        String item = items[i];
        Field field = schema.getEntity().getFieldByOrdinal(i);

        if (null != field && field.getDataType().equals(Field.DATA_TYPE_TEXT)) {
           String format = field.getTextDataSubTypeFormat();
           String subType = field.getDataSubType();
           if (subType.equals(Field.TEXT_TYPE_PERSON_NAME)) {
             item = countryFormat.personNameFormat(item);
           } else if (subType.equals(Field.TEXT_TYPE_STREET_ADDRESS)) {
             item = countryFormat.caseFormat(item, format);
             item = countryFormat.streetAddressFormat(item);
           } else if (subType.equals(Field.TEXT_TYPE_STREET_ADDRESS_ONE)) {
             item = countryFormat.caseFormat(item, format);
             item = countryFormat.streetAddressOneFormat(item);
           } else if (subType.equals(Field.TEXT_TYPE_STREET_ADDRESS_TWO)) {
             item = countryFormat.caseFormat(item, format);
             item = countryFormat.streetAddressTwoFormat(item);
           } else if (subType.equals(Field.TEXT_TYPE_CITY)) {
             item = countryFormat.caseFormat(item, format);
           } else if (subType.equals(Field.TEXT_TYPE_STATE)) {
             item = countryFormat.stateFormat(item);
           } else if (subType.equals(Field.TEXT_TYPE_ZIP)) {
             item = countryFormat.caseFormat(item, format);
           } else if (subType.equals(Field.TEXT_TYPE_COUNTRY)) {
             item = countryFormat.caseFormat(item, format);
           } else if (subType.equals(Field.TEXT_TYPE_EMAIL_ADDR)) {
             item = countryFormat.emailFormat(item, format);
           } else if (subType.equals(Field.TEXT_TYPE_PHONE_NUM)) {
             item = countryFormat.phoneNumFormat(item, format);
           } else {
             // no recognized subtype: run the value through the Lucene analyzer
             item = tokenize(item);
           }
        }
        itemList.add(item);
      }

      // build value string
      valueHolder.set(org.chombo.util.Utility.join(itemList, fieldDelim));
      context.write(NullWritable.get(), valueHolder);
    }
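
The tokenize() call in the final branch is not shown in this snippet. A plausible sketch, assuming the Lucene 3.5 StandardAnalyzer created in setup() and the stock TokenStream/CharTermAttribute API (imports: java.io.StringReader, org.apache.lucene.analysis.TokenStream, org.apache.lucene.analysis.tokenattributes.CharTermAttribute):

// Hedged sketch of the tokenize() helper map() delegates to: it rebuilds the
// field value from the analyzer's token stream (lower-cased, stop words dropped).
private String tokenize(String text) throws IOException {
  TokenStream stream = analyzer.tokenStream("contents", new StringReader(text));
  CharTermAttribute termAttribute = stream.addAttribute(CharTermAttribute.class);
  StringBuilder stBld = new StringBuilder();
  while (stream.incrementToken()) {
    stBld.append(termAttribute.toString()).append(" ");
  }
  return stBld.toString().trim();
}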
Example #3
    /* (non-Javadoc)
     * @see org.apache.hadoop.mapreduce.Mapper#setup(org.apache.hadoop.mapreduce.Mapper.Context)
     */
     @Override
     protected void setup(Context context) throws IOException, InterruptedException {
      fieldDelim = context.getConfiguration().get("field.delim", "[]");
      fieldDelimRegex = context.getConfiguration().get("field.delim.regex", "\\[\\]");
      consolidateFields = context.getConfiguration().getBoolean("consolidate.field", false);
      String textFields = context.getConfiguration().get("text.field.ordinals", "");
       String[] items = textFields.split(",");
       for (int i = 0; i < items.length; ++i) {
         // guard against the empty default so parseInt does not throw
         if (!items[i].isEmpty()) {
           textFieldOrdinals.add(Integer.parseInt(items[i]));
         }
       }
      analyzer = new StandardAnalyzer(Version.LUCENE_35);

      Configuration conf = context.getConfiguration();
      String filePath = conf.get("raw.schema.file.path");
      FileSystem dfs = FileSystem.get(conf);
      Path src = new Path(filePath);
      FSDataInputStream fs = dfs.open(src);
      ObjectMapper mapper = new ObjectMapper();
      schema = mapper.readValue(fs, SingleTypeSchema.class);

      for (Field field : schema.getEntity().getFields()) {
        retainedFieldOrdinals.add(field.getOrdinal());
      }
    }
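
For completeness, the configuration keys read in setup() could be supplied by a driver along these lines (a sketch: the class name, values, and schema path are illustrative, not from the original code):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

// Illustrative driver; only the configuration keys match setup() above,
// the values, path, and job name are made up.
public class FormatterDriver {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    conf.set("field.delim", ",");               // delimiter for the output record
    conf.set("field.delim.regex", ",");         // regex for splitting input lines
    conf.setBoolean("consolidate.field", false);
    conf.set("text.field.ordinals", "1,2,5");   // ordinals tokenized as free text
    conf.set("raw.schema.file.path", "/path/to/raw/schema.json");  // read into SingleTypeSchema
    Job job = new Job(conf, "raw data formatter");
    // ... set mapper class and input/output paths, then job.waitForCompletion(true)
  }
}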