@Override public DDF loadFromJDBC(JDBCDataSourceDescriptor dataSource) throws DDFException { SparkDDFManager sparkDDFManager = (SparkDDFManager) mDDFManager; HiveContext sqlContext = sparkDDFManager.getHiveContext(); JDBCDataSourceCredentials cred = (JDBCDataSourceCredentials) dataSource.getDataSourceCredentials(); String fullURL = dataSource.getDataSourceUri().getUri().toString(); if (cred.getUsername() != null && !cred.getUsername().equals("")) { fullURL += String.format("?user=%s&password=%s", cred.getUsername(), cred.getPassword()); } Map<String, String> options = new HashMap<String, String>(); options.put("url", fullURL); options.put("dbtable", dataSource.getDbTable()); DataFrame df = sqlContext.load("jdbc", options); DDF ddf = sparkDDFManager.newDDF( sparkDDFManager, df, new Class<?>[] {DataFrame.class}, null, SparkUtils.schemaFromDataFrame(df)); // TODO? ddf.getRepresentationHandler().get(RDD.class, Row.class); ddf.getMetaDataHandler().setDataSourceDescriptor(dataSource); return ddf; }
/**
 * Loads a JSON or Parquet file into a DDF backed by a Spark {@code DataFrame}.
 *
 * <p>The file is read through the Hive context ({@code jsonFile} for {@code JSON},
 * {@code parquetFile} for {@code PQT}), passed through
 * {@code SparkUtils.getDataFrameWithValidColnames}, and wrapped in a new DDF.
 *
 * @param format the file format; only {@code JSON} and {@code PQT} are handled
 * @param fileURI location of the file to load
 * @param flatten when {@code Boolean.TRUE}, the result of {@code getFlattenedDDF()} is returned;
 *     {@code null} is treated the same as {@code false}
 * @return the loaded DDF, flattened if requested
 * @throws DDFException if {@code format} is neither {@code JSON} nor {@code PQT}
 */
@Override
public DDF loadSpecialFormat(DataFormat format, URI fileURI, Boolean flatten)
    throws DDFException {
  SparkDDFManager sparkDDFManager = (SparkDDFManager) mDDFManager;
  HiveContext sqlContext = sparkDDFManager.getHiveContext();

  DataFrame jdf;
  switch (format) {
    case JSON:
      jdf = sqlContext.jsonFile(fileURI.toString());
      break;
    case PQT:
      jdf = sqlContext.parquetFile(fileURI.toString());
      break;
    default:
      throw new DDFException(String.format("Unsupported data format: %s", format.toString()));
  }

  DataFrame df = SparkUtils.getDataFrameWithValidColnames(jdf);
  DDF ddf =
      sparkDDFManager.newDDF(
          sparkDDFManager,
          df,
          new Class<?>[] {DataFrame.class},
          null,
          SparkUtils.schemaFromDataFrame(df));

  // `flatten` is a boxed Boolean: comparing it with `== true` auto-unboxes and throws a
  // NullPointerException for null. Boolean.TRUE.equals is null-safe.
  if (Boolean.TRUE.equals(flatten)) {
    return ddf.getFlattenedDDF();
  }
  return ddf;
}
/**
 * Computes an RMSE value between this supporter's DDF and a DDF of predictions.
 *
 * <p>Both DDFs are obtained as {@code RDD<Rating>} representations and handed to
 * {@code ROCComputer.computeRmse}.
 *
 * @param predictedDDF DDF holding the predicted ratings
 * @param implicitPrefs currently not forwarded; see the review note in the body
 * @return the value returned by {@code ROCComputer.computeRmse}
 * @throws DDFException declared for the representation lookups
 */
public double rmse(DDF predictedDDF, boolean implicitPrefs) throws DDFException {
  RDD<Rating> predicted =
      (RDD<Rating>) predictedDDF.getRepresentationHandler().get(RDD.class, Rating.class);
  RDD<Rating> actual =
      (RDD<Rating>) this.getDDF().getRepresentationHandler().get(RDD.class, Rating.class);

  ROCComputer computer = new ROCComputer();
  // NOTE(review): `implicitPrefs` is ignored and a literal `false` is passed instead.
  // Presumably the third argument is the implicit-preferences flag -- confirm against
  // ROCComputer.computeRmse before forwarding the parameter.
  double result = computer.computeRmse(actual, predicted, false);
  return result;
}
/**
 * Exposes a delimited text file as an external Hive table and returns it as a DDF.
 *
 * <p>An external table with a randomly generated name (a UUID with dashes replaced by
 * underscores) is created over the data source's URI using the
 * {@code com.bizo.hive.serde.csv.CSVSerde} serde, configured with the delimiter and quote
 * character of the source's {@code TextFileFormat}. All rows are then selected into a DDF whose
 * column names are set from the data source schema.
 *
 * @param dataSource descriptor supplying the schema, the text file format and the location
 * @return a DDF over {@code select *} from the newly created table
 * @throws DDFException propagated from the DDF manager's SQL calls
 */
@Override
public DDF loadTextFile(DataSourceDescriptor dataSource) throws DDFException {
  final String tableName = UUID.randomUUID().toString().replace("-", "_");

  // Collect the column names and the "name type, name type, ..." column definition list.
  StringBuilder columnDefs = new StringBuilder();
  List<String> names = new ArrayList<>();
  List<Schema.Column> columns = dataSource.getDataSourceSchema().getColumns();
  String separator = "";
  for (Schema.Column column : columns) {
    String columnName = column.getName();
    names.add(columnName);
    columnDefs.append(separator).append(columnName).append(" ").append(column.getType());
    separator = ", ";
  }

  TextFileFormat format = (TextFileFormat) (dataSource.getFileFormat());
  String quoteChar = format.getQuote();
  String separatorChar = format.getDelimiter();
  String location = dataSource.getDataSourceUri().getUri().toString();

  // NOTE(review): quote, delimiter and location are interpolated into the statement as-is;
  // a value containing a single quote would presumably break the generated SQL -- confirm
  // what callers may pass.
  StringBuilder createTable = new StringBuilder();
  createTable.append("create external table ").append(tableName);
  createTable.append(" (").append(columnDefs.toString()).append(") ");
  createTable.append("ROW FORMAT SERDE 'com.bizo.hive.serde.csv.CSVSerde' ");
  createTable.append("WITH serdeproperties ('separatorChar' = '").append(separatorChar);
  createTable.append("', 'quoteChar' = '").append(quoteChar).append("')");
  createTable.append(" STORED AS TEXTFILE LOCATION '").append(location).append("'");
  this.mDDFManager.sql(createTable.toString(), false);

  String selectAll = String.format("select * from %s", tableName);
  DDF loaded = this.mDDFManager.sql2ddf(selectAll, false);
  loaded.setColumnNames(names);
  return loaded;
}
/**
 * Computes ROC metrics for a DDF of predictions.
 *
 * <p>The prediction DDF is obtained as an {@code RDD<LabeledPoint>} representation and passed,
 * together with {@code alpha_length}, to {@code ROCComputer.ROC}.
 *
 * <p>NOTE(review): legacy documentation described the expected input as
 * {@code RDD[double[][]]}, but the code requests an {@code RDD<LabeledPoint>} -- confirm which
 * is intended.
 *
 * @param predictionDDF DDF holding the predictions
 * @param alpha_length forwarded unchanged to {@code ROCComputer.ROC}
 * @return the metric produced by {@code ROCComputer.ROC}
 * @throws DDFException declared for the representation lookup
 * @see io.ddf.ml.AMLMetricsSupporter#roc(io.ddf.DDF, int)
 */
@Override
public RocMetric roc(DDF predictionDDF, int alpha_length) throws DDFException {
  RDD<LabeledPoint> labeledPoints =
      (RDD<LabeledPoint>)
          predictionDDF.getRepresentationHandler().get(RDD.class, LabeledPoint.class);
  ROCComputer computer = new ROCComputer();
  RocMetric metric = computer.ROC(labeledPoints, alpha_length);
  return metric;
}