/** * Builds a mapping between the header for the incoming data to be scored and the header used to * train the model. Uses attribute names to match between the two. Also constructs a list of * missing attributes and a list of type mismatches. * * @param modelHeader the header of the data used to train the model * @param incomingHeader the header of the incoming data * @throws DistributedWekaException if more than 50% of the attributes expected by the model are * missing or have a type mismatch with the incoming data */ protected void buildAttributeMap(Instances modelHeader, Instances incomingHeader) throws DistributedWekaException { m_attributeMap = new int[modelHeader.numAttributes()]; int problemCount = 0; for (int i = 0; i < modelHeader.numAttributes(); i++) { Attribute modAtt = modelHeader.attribute(i); Attribute incomingAtt = incomingHeader.attribute(modAtt.name()); if (incomingAtt == null) { // missing model attribute m_attributeMap[i] = -1; m_missingMismatch.put(modAtt.name(), "missing from incoming data"); problemCount++; } else if (modAtt.type() != incomingAtt.type()) { // type mismatch m_attributeMap[i] = -1; m_missingMismatch.put( modAtt.name(), "type mismatch - " + "model: " + Attribute.typeToString(modAtt) + " != incoming: " + Attribute.typeToString(incomingAtt)); problemCount++; } else { m_attributeMap[i] = incomingAtt.index(); } } // -1 for the class (if set) int adjustForClass = modelHeader.classIndex() >= 0 ? 1 : 0; if (problemCount > (modelHeader.numAttributes() - adjustForClass) / 2) { throw new DistributedWekaException( "More than 50% of the attributes that the model " + "is expecting to see are either missing or have a type mismatch in the " + "incoming data."); } }
public MappingInfo(Instances dataSet, MiningSchema miningSchema, Logger log) throws Exception { m_log = log; // miningSchema.convertStringAttsToNominal(); Instances fieldsI = miningSchema.getMiningSchemaAsInstances(); m_fieldsMap = new int[fieldsI.numAttributes()]; m_nominalValueMaps = new int[fieldsI.numAttributes()][]; for (int i = 0; i < fieldsI.numAttributes(); i++) { String schemaAttName = fieldsI.attribute(i).name(); boolean found = false; for (int j = 0; j < dataSet.numAttributes(); j++) { if (dataSet.attribute(j).name().equals(schemaAttName)) { Attribute miningSchemaAtt = fieldsI.attribute(i); Attribute incomingAtt = dataSet.attribute(j); // check type match if (miningSchemaAtt.type() != incomingAtt.type()) { throw new Exception( "[MappingInfo] type mismatch for field " + schemaAttName + ". Mining schema type " + miningSchemaAtt.toString() + ". Incoming type " + incomingAtt.toString() + "."); } // check nominal values (number, names...) if (miningSchemaAtt.numValues() != incomingAtt.numValues()) { String warningString = "[MappingInfo] WARNING: incoming nominal attribute " + incomingAtt.name() + " does not have the same " + "number of values as the corresponding mining " + "schema attribute."; if (m_log != null) { m_log.logMessage(warningString); } else { System.err.println(warningString); } } if (miningSchemaAtt.isNominal() || miningSchemaAtt.isString()) { int[] valuesMap = new int[incomingAtt.numValues()]; for (int k = 0; k < incomingAtt.numValues(); k++) { String incomingNomVal = incomingAtt.value(k); int indexInSchema = miningSchemaAtt.indexOfValue(incomingNomVal); if (indexInSchema < 0) { String warningString = "[MappingInfo] WARNING: incoming nominal attribute " + incomingAtt.name() + " has value " + incomingNomVal + " that doesn't occur in the mining schema."; if (m_log != null) { m_log.logMessage(warningString); } else { System.err.println(warningString); } valuesMap[k] = UNKNOWN_NOMINAL_VALUE; } else { valuesMap[k] = indexInSchema; } } m_nominalValueMaps[i] = valuesMap; } /*if (miningSchemaAtt.isNominal()) { for (int k = 0; k < miningSchemaAtt.numValues(); k++) { if (!miningSchemaAtt.value(k).equals(incomingAtt.value(k))) { throw new Exception("[PMMLUtils] value " + k + " (" + miningSchemaAtt.value(k) + ") does not match " + "incoming value (" + incomingAtt.value(k) + ") for attribute " + miningSchemaAtt.name() + "."); } } }*/ found = true; m_fieldsMap[i] = j; } } if (!found) { throw new Exception( "[MappingInfo] Unable to find a match for mining schema " + "attribute " + schemaAttName + " in the " + "incoming instances!"); } } // check class attribute (if set) if (fieldsI.classIndex() >= 0) { if (dataSet.classIndex() < 0) { // first see if we can find a matching class String className = fieldsI.classAttribute().name(); Attribute classMatch = dataSet.attribute(className); if (classMatch == null) { throw new Exception( "[MappingInfo] Can't find match for target field " + className + "in incoming instances!"); } dataSet.setClass(classMatch); } else if (!fieldsI.classAttribute().name().equals(dataSet.classAttribute().name())) { throw new Exception( "[MappingInfo] class attribute in mining schema does not match " + "class attribute in incoming instances!"); } } // Set up the textual description of the mapping fieldsMappingString(fieldsI, dataSet); }