@Override
protected SimpleFeatureType buildFeatureType() {
  String[] headers;
  Map<String, Class<?>> typesFromData;
  CsvReader csvReader = null;
  try {
    csvReader = csvFileState.openCSVReader();
    headers = csvReader.getHeaders();
    typesFromData = CSVStrategy.findMostSpecificTypesFromData(csvReader, headers);
  } catch (IOException e) {
    throw new RuntimeException(e);
  } finally {
    if (csvReader != null) {
      csvReader.close();
    }
  }
  SimpleFeatureTypeBuilder builder =
      CSVStrategy.createBuilder(csvFileState, headers, typesFromData);

  // Replace the numeric lat/lng columns with a single Point attribute in WGS84,
  // inserted at the position of the latitude column.
  Class<?> latClass = typesFromData.get(latField);
  Class<?> lngClass = typesFromData.get(lngField);
  if (CSVStrategy.isNumeric(latClass) && CSVStrategy.isNumeric(lngClass)) {
    List<String> csvHeaders = Arrays.asList(headers);
    int index = csvHeaders.indexOf(latField);
    AttributeTypeBuilder attributeBuilder = new AttributeTypeBuilder();
    attributeBuilder.setCRS(DefaultGeographicCRS.WGS84);
    attributeBuilder.binding(Point.class);
    AttributeDescriptor descriptor = attributeBuilder.buildDescriptor(pointField);
    builder.add(index, descriptor);
    builder.remove(latField);
    builder.remove(lngField);
  }
  return builder.buildFeatureType();
}
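// ---------------------------------------------------------------------------
// Illustrative sketch (not from the source): a minimal stand-in for the column
// type scan that buildFeatureType() delegates to
// CSVStrategy.findMostSpecificTypesFromData(). The file name "points.csv" and
// the two-level type lattice (Double, else String) are assumptions; the real
// GeoTools strategy distinguishes more types.
import com.csvreader.CsvReader;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

public class CsvTypeScanSketch {
  public static void main(String[] args) throws IOException {
    CsvReader reader = new CsvReader("points.csv");
    try {
      reader.readHeaders();
      String[] headers = reader.getHeaders();
      Map<String, Class<?>> types = new HashMap<String, Class<?>>();
      while (reader.readRecord()) {
        for (String h : headers) {
          // Once a column has failed to parse as a number it stays a String.
          if (types.get(h) != String.class) {
            try {
              Double.parseDouble(reader.get(h));
              types.put(h, Double.class);
            } catch (NumberFormatException e) {
              types.put(h, String.class);
            }
          }
        }
      }
      System.out.println(types); // e.g. {lat=Double, lng=Double, name=String}
    } finally {
      reader.close();
    }
  }
}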
/**
 * Updates the table of quotes for this symbol. Assumes that the list of symbols has been
 * updated, but the table itself may not exist. Takes a date range, including both the start
 * and end days.
 *
 * <p>Yahoo Finance returns an error message rather than an empty CSV if the start and end
 * dates are today. The caller is responsible for checking that the date range is acceptable.
 *
 * @param symbol symbol to update
 * @param startDate beginning of the range to add
 * @param endDate end of the range to add
 */
static void updateSymbol(String symbol, Date startDate, Date endDate) throws Exception {
  System.out.println("Trying to update: " + symbol);
  Connection conn = initialize();
  Statement stat = conn.createStatement();
  URL data = YahooCsvDownloadUrl(symbol, startDate, endDate);
  BufferedReader in = null;
  try {
    in = new BufferedReader(new InputStreamReader(data.openStream()));
  } catch (java.io.FileNotFoundException e) {
    System.out.println("Symbol not found: " + symbol);
    e.printStackTrace();
    return;
  }
  CsvReader reader = new CsvReader(in);
  reader.readHeaders();
  String[] headers = reader.getHeaders();
  stat.executeUpdate(
      "CREATE TABLE IF NOT EXISTS " + symbol + " (" + getColNames(headers) + ");");
  String statement =
      "INSERT INTO " + symbol + " (" + getColNames(headers) + ") VALUES ("
          + getQueryQuestionMarks(headers) + ");";
  PreparedStatement prep = conn.prepareStatement(statement);
  // Accumulate one batch for the whole file and execute it in a single
  // transaction, instead of toggling auto-commit and executing per record.
  conn.setAutoCommit(false);
  while (reader.readRecord()) {
    for (int j = 0; j < headers.length; j++) {
      prep.setString(j + 1, reader.get(headers[j]));
    }
    prep.addBatch();
  }
  prep.executeBatch();
  conn.setAutoCommit(true); // re-enabling auto-commit commits the batch
  reader.close();
  in.close();
  conn.close();
}
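// ---------------------------------------------------------------------------
// Hedged sketch (not from the source): the JDBC batching pattern that
// updateSymbol() uses — disable auto-commit once, accumulate the whole file as
// one batch, then execute and commit in a single round trip. The table and
// column names ("quotes", "d", "v") are placeholders.
import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.SQLException;

class BatchInsertSketch {
  static void insertAll(Connection conn, String[][] rows) throws SQLException {
    conn.setAutoCommit(false); // one transaction for the whole batch
    try (PreparedStatement prep =
        conn.prepareStatement("INSERT INTO quotes (d, v) VALUES (?, ?)")) {
      for (String[] row : rows) {
        prep.setString(1, row[0]);
        prep.setString(2, row[1]);
        prep.addBatch(); // accumulate instead of executing per record
      }
      prep.executeBatch(); // single round trip for all rows
      conn.commit();
    } finally {
      conn.setAutoCommit(true);
    }
  }
}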
/** Sets up all the GEPSymbolSet symbols, loading them from the parameter file. */
public void setup(
    final EvolutionState state, final Parameter base, final Parameter def, GEPSpecies species) {
  // Name of the file with the terminal (variable) definitions and training values
  String terminalFilename;
  // Name of the file with the test data values, if specified
  String testingTerminalFilename;

  // keep track of the maximum arity of any function
  maxArity = 0;

  // What's my name? Not really used at this time ...
  name = state.parameters.getString(base.push(P_NAME), def.push(P_NAME));
  if (name == null || name.equals(""))
    state.output.warning(
        "No name was given for this GEP symbol set...not required at this time.",
        base.push(P_NAME),
        def.push(P_NAME));

  // How many functions do I have?
  numberOfFunctions =
      state.parameters.getInt(base.push(P_FUNCTIONSIZE), def.push(P_FUNCTIONSIZE), 1);
  numberOfSymbols = numberOfFunctions;

  // How many terminals do I have? Check for a data file first ...
  // if time series problem type and using raw time series data then
  //   the number of terminals is given by the embedding dimension value
  //   provided in the parameter file
  // else if a file is specified
  //   get the 1st line of the file and count the fields in it (#terminals is the
  //   number of fields minus the number of chromosomes/dependent variables)
  // else
  //   use the number of terminals specified in the parameter file
  terminalFilename =
      state.parameters.getStringWithDefault(
          base.push(P_TERMINALFILENAME), def.push(P_TERMINALFILENAME), "");
  testingTerminalFilename =
      state.parameters.getStringWithDefault(
          base.push(P_TESTINGTERMINALFILENAME), def.push(P_TESTINGTERMINALFILENAME), "");
  String terminalSymbolsfromFile[] = null;
  CsvReader terminalFileCSV = null;
  CsvReader testingTerminalFileCSV = null;

  // Are we processing raw time series data?
  boolean timeseriesWithRawDataValues =
      species.problemType == GEPSpecies.PT_TIMESERIES
          && species.timeseriesEmbeddingDimension > 0;

  if (!terminalFilename.equals("")) {
    String defaultTerminalFileSeparator = ","; // default field separator is comma
    try {
      // allow for gzip files ... ending with .gz or .gzip
      if (terminalFilename.endsWith(".gz") || terminalFilename.endsWith(".gzip")) {
        terminalFileCSV =
            new CsvReader(
                (InputStream) (new GZIPInputStream(new FileInputStream(terminalFilename))),
                Charset.forName("ISO-8859-1"));
        // set the terminal file name to be the one with .gz or .gzip removed from the end
        if (terminalFilename.endsWith(".gz"))
          terminalFilename = terminalFilename.substring(0, terminalFilename.length() - 3);
        else terminalFilename = terminalFilename.substring(0, terminalFilename.length() - 5);
      } else terminalFileCSV = new CsvReader(terminalFilename);
    } catch (FileNotFoundException e) {
      state.output.fatal(
          "The file with terminal definitions and/or values ("
              + terminalFilename
              + ") could not be found",
          base.push(P_TERMINALFILENAME),
          def.push(P_TERMINALFILENAME));
    } catch (IOException e) {
      state.output.fatal(
          "The file with terminal definitions and/or values ("
              + terminalFilename
              + ") could not be found or the expected GZIP file could not be opened",
          base.push(P_TERMINALFILENAME),
          def.push(P_TERMINALFILENAME));
    }

    // If the filename has extension .dat it is space delimited; if .csv (or anything
    // else for that matter) it is comma delimited.
    // (The separator can still be changed with the terminalfileseparator parameter.)
    if (terminalFilename.endsWith(".dat")) defaultTerminalFileSeparator = "space";

    // if using a file for the terminals and their values then check for a non-default separator
    String terminalFileSeparator =
        state.parameters.getStringWithDefault(
            base.push(P_TERMINALFILESEPARATOR),
            def.push(P_TERMINALFILESEPARATOR),
            defaultTerminalFileSeparator);
    if (terminalFileSeparator.toLowerCase().equals("comma")) terminalFileSeparator = ",";
    else if (terminalFileSeparator.equals("\\t")
        || terminalFileSeparator.toLowerCase().equals("tab")) terminalFileSeparator = "\t";
    else if (terminalFileSeparator.equals("space")) terminalFileSeparator = " ";
    terminalFileCSV.setDelimiter(terminalFileSeparator.charAt(0));

    // Let's check for a testing data file at this time as well ... if there is no file
    // for names and training data there is no need to worry about this one.
    if (!testingTerminalFilename.equals("")) {
      try {
        // allow for gzip files ... ending with .gz or .gzip
        if (testingTerminalFilename.endsWith(".gz")
            || testingTerminalFilename.endsWith(".gzip"))
          testingTerminalFileCSV =
              new CsvReader(
                  (InputStream)
                      (new GZIPInputStream(new FileInputStream(testingTerminalFilename))),
                  Charset.forName("ISO-8859-1"));
        else testingTerminalFileCSV = new CsvReader(testingTerminalFilename);
        testingTerminalFileCSV.setDelimiter(terminalFileSeparator.charAt(0));
      } catch (FileNotFoundException e) {
        state.output.fatal(
            "The file with testing data values ("
                + testingTerminalFilename
                + ") could not be found",
            base.push(P_TERMINALFILENAME),
            def.push(P_TERMINALFILENAME));
      } catch (IOException e) {
        state.output.fatal(
            "The file with testing data values ("
                + testingTerminalFilename
                + ") could not be found or the expected GZIP file could not be opened",
            base.push(P_TERMINALFILENAME),
            def.push(P_TERMINALFILENAME));
      }
    }
  }

  if (timeseriesWithRawDataValues) numberOfTerminals = species.timeseriesEmbeddingDimension;
  else if (terminalFileCSV != null) {
    // get the terminal symbols for the independent and dependent variables
    try {
      terminalFileCSV.readHeaders();
      terminalSymbolsfromFile = terminalFileCSV.getHeaders();
    } catch (IOException e) {
      state.output.fatal(
          "The file with variable (terminal) definitions and values ("
              + terminalFilename
              + ") failed to read the headers. "
              + e,
          base.push(P_TERMINALFILENAME),
          def.push(P_TERMINALFILENAME));
    }
    // 1 less for each dependent variable (number of chromosomes) at the end
    numberOfTerminals = terminalSymbolsfromFile.length - species.numberOfChromosomes;
    if (numberOfTerminals < 1)
      state.output.fatal(
          "The file with terminal definitions and data values ("
              + terminalFilename
              + ") has no independent variables specified in record 1",
          base.push(P_TERMINALFILENAME),
          def.push(P_TERMINALFILENAME));
  } else {
    numberOfTerminals =
        state.parameters.getInt(base.push(P_TERMINALSIZE), def.push(P_TERMINALSIZE), 1);
  }
  numberOfSymbols += numberOfTerminals;

  if (numberOfSymbols < 1)
    state.output.error(
        "The GEPSymbolSet \"" + name + "\" must have at least 1 terminal symbol defined.",
        base.push(P_TERMINALSIZE),
        def.push(P_TERMINALSIZE));

  // Add a special Symbol for constants if we are using them ... it will be added to the
  // end of the array of symbols!
  if (species.useConstants) {
    numberOfTerminals++; // special constant terminal
    numberOfSymbols++;
  }

  symbols = new GEPSymbol[numberOfSymbols];
  int numberOfSymbolsWithoutConstantSymbol = numberOfSymbols;
  if (species.useConstants) { // add the constant terminal symbol to the end
    symbols[numberOfSymbols - 1] = (GEPSymbol) (new GEPConstantTerminalSymbol());
    symbols[numberOfSymbols - 1].id = numberOfSymbols - 1;
    numberOfSymbolsWithoutConstantSymbol--;
  }

  Parameter pTerminal = base.push(P_TERMINAL);
  Parameter pdefTerminal = def.push(P_TERMINAL);
  Parameter pFunction = base.push(P_FUNCTION);
  Parameter pdefFunction = def.push(P_FUNCTION);

  // Create a hashtable of terminal names and a hashtable of function names
  // so we can easily check that none are duplicated
  Hashtable functionHT = new Hashtable();
  Hashtable terminalHT = new Hashtable();

  // process the functions
  for (int x = 0; x < numberOfFunctions; x++) {
    Parameter pp = pFunction.push("" + x);
    Parameter ppdef = pdefFunction.push("" + x);
    String function = state.parameters.getStringWithDefault(pp, ppdef, "");
    if (function.equals("")) // no name for the function
      state.output.fatal("Invalid function specifier: '" + function + "'", pp, ppdef);
    // make sure the same function is not specified more than once
    if (functionHT.get(function) != null)
      state.output.fatal(
          "Function '"
              + function
              + "' was specified more than once in the list of function symbols");
    else functionHT.put(function, function);
    GEPFunctionSymbol fs = null;
    try {
      Class classDefinition = Class.forName(LOCATION_OF_FUNCTION_CLASSES + "." + function);
      fs = (GEPFunctionSymbol) classDefinition.newInstance();
    } catch (InstantiationException e) {
      state.output.fatal(
          "Unable to create GEPFunctionSymbol class for function '" + function + "'. " + e);
    } catch (IllegalAccessException e) {
      state.output.fatal(
          "Unable to create GEPFunctionSymbol class for function '" + function + "'. " + e);
    } catch (ClassNotFoundException e) {
      state.output.fatal(
          "Unable to create GEPFunctionSymbol class for function '" + function + "'. " + e);
    }

    // a logical function may only be used with a logical problem
    if (fs.isLogicalFunction() && (species.problemType != GEPSpecies.PT_LOGICAL))
      state.output.fatal(
          "Can only use logical functions with a logical problem type. Function "
              + function
              + " is a logical function.",
          pp,
          ppdef);
    // a numerical function may only be used with a non-logical problem
    if (!fs.isLogicalFunction() && (species.problemType == GEPSpecies.PT_LOGICAL))
      state.output.fatal(
          "Can only use numerical functions with a non-logical problem type. Function "
              + function
              + " is a numerical function.",
          pp,
          ppdef);

    symbols[x] = (GEPSymbol) fs;
    // symbols[x].setup(state, base);
    if (fs.arity < 1)
      state.output.fatal("Arity must be > 0 for a GEPFunctionSymbol", pp, ppdef);
    symbols[x].id = x;
    int weight =
        state.parameters.getInt(pp.push(P_FUNCTIONWEIGHT), ppdef.push(P_FUNCTIONWEIGHT), 1);
    if (weight < 1) {
      state.output.warning(
          "Weight for a GEP function must be > 0; defaulting to 1",
          pp.push(P_FUNCTIONWEIGHT),
          ppdef.push(P_FUNCTIONWEIGHT));
      weight = 1;
    }
    symbols[x].weight = weight;
    if (symbols[x].arity > maxArity) maxArity = symbols[x].arity;
  }

  // Process the terminals ... defined by default for time series data, in the
  // CSV file if specified and not time series, or in the params file if neither of those.
  for (int x = numberOfFunctions; x < numberOfSymbolsWithoutConstantSymbol; x++) {
    // load the terminal symbols
    int index = x - numberOfFunctions;
    String terminal = "";
    if (timeseriesWithRawDataValues) {
      // terminals get the default names v0, v1, v2, v3, ... vn-1
      terminal = "v" + index;
    } else if (terminalFileCSV == null) { // terminals defined in the param file
      Parameter pp = pTerminal.push("" + index);
      Parameter ppdef = pdefTerminal.push("" + index);
      terminal = state.parameters.getStringWithDefault(pp, ppdef, "");
    } else { // terminals defined in the CSV file
      terminal = terminalSymbolsfromFile[index];
    }
    if (terminal.equals("")) // no name for the terminal
      state.output.fatal(
          "Invalid terminal specifier: '" + terminal + "' for terminal # " + index);
    // make sure the same terminal is not specified more than once
    if (terminalHT.get(terminal) != null)
      state.output.fatal(
          "Terminal symbol (indep var) '"
              + terminal
              + "' was specified more than once in the list of terminal symbols (independent variables)");
    else terminalHT.put(terminal, terminal);
    GEPTerminalSymbol ts = new GEPTerminalSymbol(terminal, this);
    symbols[x] = (GEPSymbol) ts;
    // symbols[x].setup(state, base);
    if (ts.arity != 0) // cannot happen
      state.output.fatal("Arity must be exactly 0 for a GEPTerminalSymbol");
    symbols[x].id = x;
    symbols[x].weight = 1; // all terminal symbols have a weight of 1
  }

  // There must be at least 1 terminal symbol in the SymbolSet.
  // If not, the user specified the terminals neither in the param file nor in the data file.
  if (numberOfTerminals < 1)
    state.output.fatal(
        "Must be at least one Terminal Symbol in the set of GEPSymbols\n"
            + "Either did not specify the terminal symbols in the param file or\n"
            + "did not specify the appropriate data file with the terminals specified in the first line.");

  // Collect the ids (indices) of the terminal and function symbols that
  // are in the set of symbols
  terminals = new int[numberOfTerminals];
  int terminalNum = 0;
  functions = new int[numberOfFunctions];
  int functionNum = 0;
  for (int x = 0; x < numberOfSymbols; x++) {
    if (symbols[x] instanceof GEPConstantTerminalSymbol) terminals[terminalNum++] = x;
    else if (symbols[x] instanceof GEPTerminalSymbol) terminals[terminalNum++] = x;
    else if (symbols[x] instanceof GEPFunctionSymbol) functions[functionNum++] = x;
  }

  // Collect the weights for symbols and terminals, then normalize and cumulate them.
  // We can then use these arrays to pick appropriate symbols or terminals according to
  // their weights ... using RandomChoice.pickFromDistribution
  cumulativeNormalizedSymbolWeights = new float[numberOfSymbols];
  cumulativeNormalizedTerminalWeights = new float[numberOfTerminals];
  cumulativeNormalizedFunctionWeights = new float[numberOfFunctions];
  int j = 0, k = 0;
  for (int i = 0; i < numberOfSymbols; i++) {
    float weight = (float) (symbols[i].weight);
    cumulativeNormalizedSymbolWeights[i] = weight;
    if (symbols[i] instanceof GEPTerminalSymbol
        || symbols[i] instanceof GEPConstantTerminalSymbol)
      cumulativeNormalizedTerminalWeights[j++] = weight;
    if (symbols[i] instanceof GEPFunctionSymbol)
      cumulativeNormalizedFunctionWeights[k++] = weight;
  }
  RandomChoice.organizeDistribution(cumulativeNormalizedSymbolWeights);
  RandomChoice.organizeDistribution(cumulativeNormalizedTerminalWeights);
  RandomChoice.organizeDistribution(cumulativeNormalizedFunctionWeights);

  // Use the 2/3 rule if there are fewer functions, else the 1/2 rule (don't count the
  // constant terminal here)
  if (numberOfFunctions < (numberOfTerminals - (species.useConstants ? 1 : 0)))
    probabilityOfChoosingFunction = 2.0 / 3.0;
  else probabilityOfChoosingFunction = 0.5;

  // ... and finally get the training and testing data values for the terminals and the
  // dependent variable(s), and put them into the Terminal instances (creating a 'special'
  // Terminal Symbol to hold the dependent variable training and testing values).
  //
  // If this is a time series problem AND we are using the raw time series data then
  // we named the terminals v0, v1, ..., vn-1, where n is the number of independent
  // variables as specified in the embedding dimension (which was used to
  // determine the number of terminals). But we have to process the time series data
  // to get the values for each terminal ... get the raw data from the CSV file
  // if specified, or from the user program ... then process it into rows of data
  // representing the independent variables and the dependent variable.
  //
  // timeseries-delay -- if 1, uses each time series value; if 2, uses every other one; etc.
  // timeseries-embeddingdimension -- determines the number of time series points to use
  //   as independent variables when transforming the set of time series data. Another
  //   data point is used as the dependent variable value. So the time series 'raw' data,
  //   consisting of a list of single values, is processed by splitting the data into
  //   groups (rows) of size embeddingdimension+1. From the end of the time series data,
  //   embeddingdimension+1 values are chosen (if delay is 1 all values are chosen; if
  //   2, every other one is chosen). The last value is the dependent variable value.
  //   Then the next row is selected by moving 'delay' values from the end and choosing
  //   embeddingdimension+1 values. This is repeated until no more sets of size
  //   embeddingdimension+1 can be chosen. If this produces n sets of data then
  //   testingpredictions of them are used for testing and (n - testingpredictions)
  //   are used for training.
  //
  // So if we had the data:
  //   1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21
  // and delay was 1 and embeddingdimension was 4, then we'd process the set into
  // the following 17 data sets. If testingpredictions was 6 then the first 11
  // would be used for training and the last 6 for testing.
  //   iv1 iv2 iv3 iv4  dv
  //    1   2   3   4    5
  //    2   3   4   5    6
  //    3   4   5   6    7
  //    . . .
  //   14  15  16  17   18
  //   15  16  17  18   19
  //   16  17  18  19   20
  //   17  18  19  20   21
  // If delay was 2 then 7 sets would be formed as:
  //   iv1 iv2 iv3 iv4  dv
  //    1   3   5   7    9
  //    3   5   7   9   11
  //    . . .
  //    9  11  13  15   17
  //   11  13  15  17   19
  //   13  15  17  19   21
  // timeseries-testingpredictions -- the number of sets of data to devote to testing
  if (timeseriesWithRawDataValues) {
    GEPDependentVariable.symbol = "dependentVariable";
    double rawTimeSeriesValues[] = null;
    if (terminalFileCSV == null)
      rawTimeSeriesValues = ((GEPProblem) state.evaluator.p_problem).getTimeSeriesDataValues();
    else rawTimeSeriesValues = getRawTimeSeriesValuesFromCSVfile(state, terminalFileCSV);
    if (rawTimeSeriesValues == null)
      state.output.fatal("Unable to get time series data values from User Program or CSV file");
    Vector values[] = processRawTimeSeriesValues(state, species, rawTimeSeriesValues);
    // We have an array of vectors: 1 vector for each indep variable and the dep variable(s).
    for (int i = 0; i < values.length; i++) {
      // get the values for training ... and testing (specified by timeseriesTestingPredictions)
      int sizeOfTrainingData = values[i].size() - species.timeseriesTestingPredictions;
      double v[] = new double[sizeOfTrainingData];
      double testingV[] = new double[species.timeseriesTestingPredictions];
      for (int m = 0; m < v.length; m++)
        v[m] = ((Double) values[i].elementAt(m)).doubleValue();
      for (int n = 0; n < testingV.length; n++)
        testingV[n] = ((Double) values[i].elementAt(n + sizeOfTrainingData)).doubleValue();
      int depVarIndex = i - values.length + species.numberOfChromosomes;
      if (depVarIndex >= 0) { // last column(s) in the file is(are) the dependent variable(s)
        GEPDependentVariable.trainingData.setValues(v, depVarIndex);
        GEPDependentVariable.testingData.setValues(testingV, depVarIndex);
      } else {
        ((GEPTerminalSymbol) symbols[numberOfFunctions + i]).setTrainingValues(v);
        ((GEPTerminalSymbol) symbols[numberOfFunctions + i]).setTestingValues(testingV);
      }
    }
  }
  // else if there is a file with the terminals and dep variable(s), use it; otherwise ask
  // for the values from the User Program (problem).
  else if (terminalFileCSV != null) { // terminals defined in CSV file
    GEPDependentVariable.symbol = terminalSymbolsfromFile[terminalSymbolsfromFile.length - 1];
    // Get all the values into an array of vectors (each vector holds the values for a
    // single terminal (dep or indep variable))
    Vector values[] = new Vector[terminalSymbolsfromFile.length];
    for (int i = 0; i < terminalSymbolsfromFile.length; i++) values[i] = new Vector();
    try {
      while (terminalFileCSV.readRecord()) {
        for (int i = 0; i < terminalSymbolsfromFile.length; i++)
          values[i].add(terminalFileCSV.get(i));
      }
    } catch (IOException e) {
      state.output.fatal(
          "The file with terminal definitions/values failed when reading records. " + e);
    }
    for (int i = 0; i < terminalSymbolsfromFile.length; i++) {
      double v[] = new double[values[i].size()];
      for (int m = 0; m < v.length; m++)
        try {
          v[m] = Double.parseDouble((String) values[i].elementAt(m));
        } catch (Exception e) {
          state.output.fatal(
              "Failed trying to read a training data set value. The field is supposed to be a number but was the string '"
                  + (String) values[i].elementAt(m)
                  + "'.\n"
                  + e);
        }
      int jj = terminalSymbolsfromFile.length - species.numberOfChromosomes;
      if (i >= jj) // last column(s) in the file is(are) the dependent variable(s)
        GEPDependentVariable.trainingData.setValues(v, i - jj);
      else ((GEPTerminalSymbol) symbols[numberOfFunctions + i]).setTrainingValues(v);
    }

    // get the testing data as well if a file was specified
    if (testingTerminalFileCSV != null) { // testing data defined in CSV file
      // Get all the values into an array of vectors (each vector holds the values for a
      // single terminal (dep or indep variable))
      Vector testingValues[] = new Vector[terminalSymbolsfromFile.length];
      for (int i = 0; i < terminalSymbolsfromFile.length; i++) testingValues[i] = new Vector();
      try {
        while (testingTerminalFileCSV.readRecord()) {
          for (int i = 0; i < terminalSymbolsfromFile.length; i++)
            testingValues[i].add(testingTerminalFileCSV.get(i));
        }
      } catch (IOException e) {
        state.output.fatal(
            "The file with testing data values failed when reading records. "
                + "\nMake sure the file has the same column separators as the training data file."
                + "\nAlso check that it has the same number of columns as the training file. "
                + e);
      }
      for (int i = 0; i < terminalSymbolsfromFile.length; i++) {
        double v[] = new double[testingValues[i].size()];
        for (int m = 0; m < v.length; m++)
          try {
            v[m] = Double.parseDouble((String) testingValues[i].elementAt(m));
          } catch (Exception e) {
            state.output.fatal(
                "Failed trying to read a testing data set value. The field is supposed to be a number but was the string '"
                    + (String) testingValues[i].elementAt(m)
                    + "'.\n"
                    + e);
          }
        int jj = terminalSymbolsfromFile.length - species.numberOfChromosomes;
        if (i >= jj) // last column(s) in the file is(are) the dependent variable(s)
          GEPDependentVariable.testingData.setValues(v, i - jj);
        else ((GEPTerminalSymbol) symbols[numberOfFunctions + i]).setTestingValues(v);
      }
    }
  }
  // else terminals were defined in the param file and no CSV file was specified,
  // so ... ask the User Problem for the values, training and testing (if there are any)
  else {
    GEPDependentVariable.symbol = "dependentVariable";
    GEPProblem prob = (GEPProblem) state.evaluator.p_problem;
    double vals[] = null;
    for (int i = numberOfFunctions; i < numberOfSymbolsWithoutConstantSymbol; i++) {
      GEPTerminalSymbol ts = (GEPTerminalSymbol) symbols[i];
      vals = prob.getDataValues(ts.symbol);
      if (vals == null)
        state.output.fatal(
            "Expecting user problem (GEPProblem/ProblemForm) to supply training data values for terminal symbol '"
                + ts
                + "'.");
      ts.setTrainingValues(vals);
      vals = prob.getTestingDataValues(ts.symbol);
      if (vals != null) // don't have to supply testing data
        ts.setTestingValues(vals);
    }
    // If there is just one dep var, ask the user for it with getDataValues("dependentVariable");
    // if there is more than one dep var (more than 1 chromosome), ask for the dep variables
    // with getDataValues("dependentVariable0"), getDataValues("dependentVariable1"), ...
    for (int i = 0; i < species.numberOfChromosomes; i++) {
      String depVarSym = GEPDependentVariable.symbol;
      if (species.numberOfChromosomes > 1) depVarSym = depVarSym + i;
      vals = prob.getDataValues(depVarSym);
      if (vals == null)
        state.output.fatal(
            "Expecting user problem (GEPProblem/ProblemForm) to supply training data values for dependent variable '"
                + depVarSym
                + "'.");
      GEPDependentVariable.trainingData.setValues(vals, i);
      vals = prob.getTestingDataValues(depVarSym);
      if (vals != null) // don't have to supply testing data
        GEPDependentVariable.testingData.setValues(vals, i);
    }
  }

  // Some checking of data values to ensure they meet the requirements for the various
  // problem types.
  // For all problem types, make sure all indep vars and the dep var have the same
  // number of values!
  int numValues = GEPDependentVariable.trainingData.values[0].length;
  for (int i = numberOfFunctions; i < numberOfSymbolsWithoutConstantSymbol; i++)
    if (((GEPTerminalSymbol) symbols[i]).trainingValues.length != numValues)
      state.output.fatal(
          "Must have same number of values for all independent variables and the dependent variable."
              + "\nNumber of values for Dependent Variable is: "
              + numValues
              + "\nNumber of values for Independent Variable '"
              + symbols[i].symbol
              + "' is: "
              + ((GEPTerminalSymbol) symbols[i]).trainingValues.length);

  // For classification and logical problems all dependent variable values must be 0 or 1
  if (species.problemType == GEPSpecies.PT_CLASSIFICATION
      || species.problemType == GEPSpecies.PT_LOGICAL) {
    double dvVals[] = GEPDependentVariable.trainingData.values[0];
    for (int i = 0; i < numValues; i++)
      if (dvVals[i] != 0.0 && dvVals[i] != 1.0)
        state.output.fatal(
            "For classification/logical problems all dependent variable values must be either 1 or 0.\nFound value "
                + dvVals[i]
                + " at index "
                + i
                + " in the values.");
  }

  // For logical problems all independent variable values must be 0 or 1
  if (species.problemType == GEPSpecies.PT_LOGICAL) {
    // for each indep variable symbol
    for (int i = numberOfFunctions; i < numberOfSymbolsWithoutConstantSymbol; i++) {
      double ivVals[] = ((GEPTerminalSymbol) symbols[i]).trainingValues;
      for (int m = 0; m < numValues; m++)
        if (ivVals[m] != 0.0 && ivVals[m] != 1.0)
          state.output.fatal(
              "For logical problems all independent variable values must be either 1 or 0.\nFound value "
                  + ivVals[m]
                  + " at index '"
                  + m
                  + "' in the variable '"
                  + ((GEPTerminalSymbol) symbols[i]).symbol
                  + "'.");
    }
  }

  state.output.exitIfErrors();
}
public StringBuffer uploadAndReportCustomDataFile(
    InputStream inputStream,
    long size,
    String fileFormat,
    char delimChar,
    List<String> listOfUIDsToUpdate,
    CustomFieldGroup customFieldGroup,
    PhenoCollection phenoCollection,
    boolean overwriteExisting)
    throws FileFormatException, ArkSystemException {
  List<PhenoCollection> phenoCollectionsWithTheirDataToInsert = new ArrayList<PhenoCollection>();
  delimiterCharacter = delimChar;
  uploadReport = new StringBuffer();
  InputStream convertedInputStream;
  if (fileFormat.equalsIgnoreCase(Constants.FileFormat.XLS.toString())) {
    XLStoCSV xlsToCSV = new XLStoCSV(delimiterCharacter);
    convertedInputStream = xlsToCSV.convertXlsInputStreamToCsv(inputStream);
  } else {
    convertedInputStream = inputStream;
  }
  InputStreamReader inputStreamReader = null;
  CsvReader csvReader = null;
  DecimalFormat decimalFormat = new DecimalFormat("0.00");
  int subjectCount = 0;
  long updateFieldsCount = 0L;
  long insertFieldsCount = 0L;
  long emptyDataCount = 0L;
  try {
    inputStreamReader = new InputStreamReader(convertedInputStream);
    csvReader = new CsvReader(inputStreamReader, delimiterCharacter);
    String[] stringLineArray;
    List<LinkSubjectStudy> allSubjectWhichWillBeUpdated = null;
    if (listOfUIDsToUpdate.size() > 0) {
      allSubjectWhichWillBeUpdated =
          iArkCommonService.getUniqueSubjectsWithTheseUIDs(study, listOfUIDsToUpdate);
    } else {
      allSubjectWhichWillBeUpdated = new ArrayList<LinkSubjectStudy>();
    }
    if (size <= 0) {
      uploadReport.append("ERROR: The input size was not greater than 0. Actual length reported: ");
      uploadReport.append(size);
      uploadReport.append("\n");
      throw new FileFormatException(
          "The input size was not greater than 0. Actual length reported: " + size);
    }
    csvReader.readHeaders();
    List<String> fieldNameCollection = Arrays.asList(csvReader.getHeaders());
    ArkFunction phenoCustomFieldArkFunction =
        iArkCommonService.getArkFunctionByName(Constants.FUNCTION_KEY_VALUE_PHENO_COLLECTION);
    List<CustomFieldDisplay> cfdsThatWeNeed =
        iArkCommonService.getCustomFieldDisplaysIn(
            fieldNameCollection, study, phenoCustomFieldArkFunction, customFieldGroup);
    // Paul has requested - in pheno we only insert
    // List<PhenoData> dataThatWeHave =
    //     iArkCommonService.getCustomFieldDataFor(cfdsThatWeNeed, allSubjectWhichWillBeUpdated);

    // read one line, which potentially contains many custom fields
    QuestionnaireStatus uploadingStatus =
        iPhenotypicService.getPhenoCollectionStatusByName(
            Constants.PHENO_COLLECTION_STATUS_UPLOADED);
    while (csvReader.readRecord()) {
      List<PhenoData> phenoDataToInsertForThisPhenoCollection = new ArrayList<PhenoData>();
      log.info("reading record " + subjectCount);
      stringLineArray = csvReader.getValues();
      String subjectUID = stringLineArray[0];
      String recordDate = stringLineArray[1];
      Date recordDate_asDate =
          (recordDate.isEmpty() ? new Date() : simpleDateFormat.parse(recordDate));
      LinkSubjectStudy subject =
          getSubjectByUIDFromExistList(allSubjectWhichWillBeUpdated, subjectUID);
      CustomField customField = null;
      List<PhenoCollection> subjectExistingMatchingPhenoCollections =
          iPhenotypicService.getSubjectMatchingPhenoCollections(
              subject, customFieldGroup, recordDate_asDate);
      PhenoCollection phenoCollectionIntoDB = new PhenoCollection();
      if (subjectExistingMatchingPhenoCollections.size() == 0 || !overwriteExisting) {
        phenoCollectionIntoDB.setDescription(phenoCollection.getDescription());
        phenoCollectionIntoDB.setLinkSubjectStudy(subject);
        // phenoCollectionIntoDB.setName(phenoCollection.getName());
        phenoCollectionIntoDB.setQuestionnaire(customFieldGroup);
        if (recordDate.isEmpty()) {
          phenoCollectionIntoDB.setRecordDate(new Date());
        } else {
          phenoCollectionIntoDB.setRecordDate(recordDate_asDate);
        }
        phenoCollectionIntoDB.setStatus(uploadingStatus); // TODO for this to be UPLOADED TYPE STATUS
      } else {
        if (subjectExistingMatchingPhenoCollections.size() == 1) {
          recordDate_asDate =
              (recordDate.isEmpty() ? new Date() : simpleDateFormat.parse(recordDate));
          phenoCollectionIntoDB = subjectExistingMatchingPhenoCollections.get(0);
        } else {
          subjectCount++;
          continue;
        }
      }
      for (CustomFieldDisplay cfd : cfdsThatWeNeed) {
        String theDataAsString = null;
        customField = cfd.getCustomField();
        if (csvReader.getIndex(cfd.getCustomField().getName()) < 0) {
          // header did not match exactly; fall back to a case-insensitive lookup
          for (String nameAsSeenInFile : fieldNameCollection) {
            if (nameAsSeenInFile.equalsIgnoreCase(cfd.getCustomField().getName())) {
              theDataAsString = csvReader.get(nameAsSeenInFile);
            }
          }
        } else {
          theDataAsString = csvReader.get(cfd.getCustomField().getName());
        }
        if (theDataAsString != null && !theDataAsString.isEmpty()) {
          PhenoData dataToInsert = new PhenoData();
          dataToInsert.setCustomFieldDisplay(cfd);
          // as much as I disagree...pheno data isn't tied to subject....pheno collection is
          // dataToInsert.setLinkSubjectStudy(subject);
          setValue(customField, cfd, dataToInsert, theDataAsString);
          boolean flag = true;
          for (PhenoData phenoData : phenoCollectionIntoDB.getPhenoData()) {
            // compare ids with equals(); == on boxed ids is a reference comparison
            if (phenoData.getCustomFieldDisplay().getId().equals(cfd.getId())) {
              phenoData.setDateDataValue(dataToInsert.getDateDataValue());
              phenoData.setErrorDataValue(dataToInsert.getErrorDataValue());
              phenoData.setNumberDataValue(dataToInsert.getNumberDataValue());
              phenoData.setTextDataValue(dataToInsert.getTextDataValue());
              flag = false;
              break;
            }
          }
          if (flag) {
            phenoDataToInsertForThisPhenoCollection.add(dataToInsert);
          }
          insertFieldsCount++;
        } else {
          emptyDataCount++;
        }
      }
      phenoCollectionIntoDB.getPhenoData().addAll(phenoDataToInsertForThisPhenoCollection);
      log.info(phenoCollectionIntoDB.toString());
      phenoCollectionsWithTheirDataToInsert.add(phenoCollectionIntoDB);
      subjectCount++;
    }
    log.info(
        "finished message for "
            + subjectCount
            + "\n DATA inserts = "
            + insertFieldsCount
            + " phenocollections = "
            + phenoCollectionsWithTheirDataToInsert.size()
            + " number of empty cells = "
            + emptyDataCount);
  } catch (IOException ioe) {
    uploadReport.append(
        "SYSTEM ERROR: Unexpected I/O exception whilst reading the subject data file\n");
    log.error("processMatrixSubjectFile IOException stacktrace:", ioe);
    throw new ArkSystemException("Unexpected I/O exception whilst reading the subject data file");
  } catch (Exception ex) {
    uploadReport.append(
        "SYSTEM ERROR: Unexpected exception whilst reading the subject data file\n");
    log.error("processMatrixSubjectFile Exception stacktrace:", ex);
    throw new ArkSystemException(
        "Unexpected exception occurred when trying to process subject data file");
  } finally {
    uploadReport.append("Total file size: ");
    uploadReport.append(decimalFormat.format(size / 1024.0 / 1024.0));
    uploadReport.append(" MB");
    uploadReport.append("\n");
    if (csvReader != null) {
      try {
        csvReader.close();
      } catch (Exception ex) {
        log.error("Cleanup operation failed: csvRdr.close()", ex);
      }
    }
    if (inputStreamReader != null) {
      try {
        inputStreamReader.close();
      } catch (Exception ex) {
        log.error("Cleanup operation failed: isr.close()", ex);
      }
    }
  }
  uploadReport.append("Processed ");
  uploadReport.append(subjectCount);
  uploadReport.append(" rows.");
  uploadReport.append("\n");
  uploadReport.append("Inserted ");
  uploadReport.append(insertFieldsCount);
  uploadReport.append(" rows of data.");
  uploadReport.append("\n");
  uploadReport.append("Updated ");
  uploadReport.append(updateFieldsCount);
  uploadReport.append(" rows of data.");
  uploadReport.append("\n");
  // TODO better exception handling
  iPhenotypicService.processPhenoCollectionsWithTheirDataToInsertBatch(
      phenoCollectionsWithTheirDataToInsert, study);
  return uploadReport;
}
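// ---------------------------------------------------------------------------
// Hedged sketch (not from the source): the header-lookup fallback used in the
// upload loop above, isolated. When getIndex() reports a column as missing,
// the code rescans the header list ignoring case. The class, method, and
// parameter names here are placeholders.
import com.csvreader.CsvReader;
import java.io.IOException;

class HeaderLookupSketch {
  static String getIgnoreCase(CsvReader reader, String wanted) throws IOException {
    if (reader.getIndex(wanted) >= 0) {
      return reader.get(wanted); // exact match on the header name
    }
    for (String header : reader.getHeaders()) {
      if (header.equalsIgnoreCase(wanted)) {
        return reader.get(header); // first case-insensitive match
      }
    }
    return null; // column absent from this file
  }
}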
private static void filterCSV(
    String filename, LinkedList<String> filters, LinkedList<String> includes)
    throws IOException {
  FileReader fr = new FileReader(filename);
  CsvReader csvIn = new CsvReader(fr, SEPARATOR);
  csvIn.setSafetySwitch(false);
  if (csvIn.readHeaders()) {
    csvIn.readRecord();
    System.out.println("'" + filename + "' has " + csvIn.getColumnCount() + " columns.");
    int usedColumns = 0;
    String[] headers = csvIn.getHeaders();
    StringBuffer newHeader = new StringBuffer();
    StringBuffer newValues = new StringBuffer();
    HashMap<String, String> data = new HashMap<String, String>();
    allData.put(filename, data);
    for (String header : headers) {
      boolean matches = false;
      // check if a filter matches the entry
      for (String filter : filters) {
        if (header.contains(filter)) {
          matches = true;
          // ok, the filter matches, but maybe the header is on the include list?
          for (String include : includes) {
            if (header.contains(include)) {
              matches = false;
              break;
            }
          }
          break;
        }
      }
      if (!matches) {
        usedColumns++;
        String value = csvIn.get(header);
        newHeader.append(header);
        newHeader.append(SEPARATOR_OUT);
        newValues.append(value);
        newValues.append(SEPARATOR_OUT);
        if (!keys.containsKey(header)) {
          keys.put(header, true);
        }
        data.put(header, value);
      }
    }
    System.out.println(" -> " + usedColumns + " columns remain");
    FileWriter fw = new FileWriter(filename + FILENAME_POSTFIX, false);
    fw.write(newHeader.toString());
    fw.write(NEW_LINE);
    fw.write(newValues.toString());
    fw.close();
  } else {
    System.err.println("Cannot read headers from '" + filename + "'");
  }
}
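// ---------------------------------------------------------------------------
// Sketch (not from the source): the keep/drop decision from filterCSV() in
// isolation — a header is dropped when it contains any filter term, unless an
// include term rescues it.
import java.util.List;

class HeaderFilterSketch {
  static boolean keep(String header, List<String> filters, List<String> includes) {
    for (String filter : filters) {
      if (header.contains(filter)) {
        for (String include : includes) {
          if (header.contains(include)) {
            return true; // matched a filter but rescued by the include list
          }
        }
        return false; // matched a filter, not rescued
      }
    }
    return true; // no filter matched
  }
}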
/**
 * Imports indicator definitions into the DQ structure from a CSV, XLS, or .definition file.
 *
 * @param importObject the object describing the file to import
 * @param selectionFolder the folder to import into
 * @param skip skip entries whose names already exist
 * @param rename rename entries whose names already exist
 * @param importItemName the name of the item being imported
 * @return return codes describing the outcome of each imported entry
 */
public static List<ReturnCode> importIndicatorToStucture(
    ImportObject importObject,
    IFolder selectionFolder,
    boolean skip,
    boolean rename,
    String importItemName) {
  List<ReturnCode> information = new ArrayList<ReturnCode>();
  Set<String> names = UDIHelper.getAllIndicatorNames(selectionFolder);
  File importFile = importObject.getObjFile();
  String fileExtName = getFileExtName(importFile);

  if ("csv".equalsIgnoreCase(fileExtName)) { //$NON-NLS-1$
    String name = PluginConstant.EMPTY_STRING;
    try {
      CsvReader reader = new CsvReader(new FileReader(importFile), CURRENT_SEPARATOR);
      // MOD zshen EscapeMode default is CsvReader.ESCAPE_MODE_DOUBLED
      reader.setTextQualifier(TEXT_QUAL);
      reader.setUseTextQualifier(USE_TEXT_QUAL);
      reader.readHeaders();
      java.text.SimpleDateFormat simpleDateFormat =
          new java.text.SimpleDateFormat("yyyyMMddHHmmssSSS"); //$NON-NLS-1$
      while (reader.readRecord()) {
        name = reader.get(PatternToExcelEnum.Label.getLiteral());
        if (names.contains(name)) {
          if (skip) {
            information.add(
                new ReturnCode(
                    DefaultMessagesImpl.getString("ImportFactory.Imported", name), //$NON-NLS-1$
                    false));
            continue;
          }
          if (rename) {
            name = name + "(" + simpleDateFormat.format(new Date()) + Math.random() + ")"; //$NON-NLS-1$ //$NON-NLS-2$
          }
        }
        UDIParameters udiParameters = new ImportFactory().new UDIParameters();
        udiParameters.name = name;
        udiParameters.auther = reader.get(PatternToExcelEnum.Author.getLiteral());
        udiParameters.description = reader.get(PatternToExcelEnum.Description.getLiteral());
        udiParameters.purpose = reader.get(PatternToExcelEnum.Purpose.getLiteral());
        udiParameters.relativePath = reader.get(PatternToExcelEnum.RelativePath.getLiteral());
        udiParameters.category = reader.get(PatternToExcelEnum.Category.getLiteral());
        udiParameters.javaClassName = reader.get(PatternToExcelEnum.JavaClassName.getLiteral());
        udiParameters.javaJarPath = reader.get(PatternToExcelEnum.JavaJarPath.getLiteral());
        String[] headers = reader.getHeaders();
        String[] columnsValue = reader.getValues();
        HashMap<String, String> record = new HashMap<String, String>();
        for (int i = 0; i < headers.length; i++) {
          if (columnsValue[i] != null && columnsValue[i].length() > 0) {
            record.put(headers[i], columnsValue[i]);
          }
        }
        for (PatternLanguageType languagetype : PatternLanguageType.values()) {
          String cellStr = record.get(languagetype.getExcelEnum().getLiteral());
          if (cellStr != null && !cellStr.equals("\"\"")) { //$NON-NLS-1$
            udiParameters.regex.put(languagetype.getLiteral(), trimQuote(cellStr));
          }
        }
        udiParameters.setParaMap(buildIndDefPara(record));
        TypedReturnCode<Object> create = createAndStoreUDI(udiParameters, selectionFolder);
        if (create.isOk()) {
          names.add(name);
          // add the success message to display
          information.add(
              new ReturnCode(
                  DefaultMessagesImpl.getString(
                      "ImportFactory.importedSucess", //$NON-NLS-1$
                      ((TDQItem) create.getObject()).getProperty().getDisplayName(),
                      selectionFolder.getProjectRelativePath().toString()),
                  true));
        } else {
          throw new TalendInternalPersistenceException(create.getMessage());
        }
      }
      reader.close();
    } catch (Exception e) {
      log.error(e, e);
      information.add(
          new ReturnCode(
              DefaultMessagesImpl.getString("ImportFactory.importedFailed", name), //$NON-NLS-1$
              false));
    }
  }

  if ("xls".equalsIgnoreCase(fileExtName)) { //$NON-NLS-1$
    Map<Integer, PatternLanguageType> expressionMap = new HashMap<Integer, PatternLanguageType>();
    String contents = PluginConstant.EMPTY_STRING;
    try {
      WorkbookSettings settings = new WorkbookSettings();
      settings.setEncoding("UTF-8"); //$NON-NLS-1$
      Workbook rwb = Workbook.getWorkbook(importFile, settings);
      Sheet[] sheets = rwb.getSheets();
      for (Sheet sheet : sheets) {
        Cell[] headerRow = sheet.getRow(0);
        for (Cell cell : headerRow) {
          for (PatternLanguageType languageType : PatternLanguageType.values()) {
            if (cell.getContents().equals(languageType.getExcelEnum().getLiteral())) {
              expressionMap.put(cell.getColumn(), languageType);
            }
          }
        }
        for (int i = 1; i < sheet.getRows(); i++) {
          Cell[] row = sheet.getRow(i);
          Cell cell = row[0];
          if (CellType.LABEL.equals(cell.getType())) {
            contents = cell.getContents();
            if (names.contains(contents)) {
              if (skip) {
                continue;
              }
              if (rename) {
                contents = contents + "(" + new Date() + ")"; //$NON-NLS-1$ //$NON-NLS-2$
              }
            }
            UDIParameters udiParameters = new ImportFactory().new UDIParameters();
            udiParameters.name = contents;
            udiParameters.auther = row[6].getContents();
            udiParameters.description = row[2].getContents();
            udiParameters.purpose = row[1].getContents();
            udiParameters.status = DevelopmentStatus.DRAFT.getLiteral();
            udiParameters.category = row[16].getContents();
            for (int columnIndex : expressionMap.keySet()) {
              String rowContent = row[columnIndex].getContents();
              if (!rowContent.equals("")) { //$NON-NLS-1$
                udiParameters.regex.put(expressionMap.get(columnIndex).getLiteral(), rowContent);
              }
            }
            createAndStoreUDI(udiParameters, selectionFolder);
            names.add(contents);
            information.add(
                new ReturnCode(
                    DefaultMessagesImpl.getString(
                        "ImportFactory.importedSucess", //$NON-NLS-1$
                        contents),
                    true));
          }
        }
      }
      rwb.close();
    } catch (Exception e) {
      log.error(e, e);
      information.add(
          new ReturnCode(
              DefaultMessagesImpl.getString("ImportFactory.importedFailed", contents), //$NON-NLS-1$
              false));
    }
  }

  // MOD qiongli 2011-11-28 TDQ-4038. Consider importing the definition file.
  if (FactoriesUtil.DEFINITION.equalsIgnoreCase(fileExtName)) {
    String propFilePath =
        importFile
            .getPath()
            .replaceFirst(
                PluginConstant.DOT_STRING + fileExtName,
                PluginConstant.DOT_STRING + FactoriesUtil.PROPERTIES_EXTENSION);
    File propFile = new File(propFilePath);
    // only import a definition file that has the related Property file
    if (!propFile.exists()) {
      return information;
    }
    String name = importFile.getName();
    try {
      if (names.contains(name)) {
        if (skip) {
          information.add(
              new ReturnCode(
                  DefaultMessagesImpl.getString("ImportFactory.Imported", name), //$NON-NLS-1$
                  false));
          return information;
        }
        if (rename) {
          name = name + "(" + new Date() + Math.random() + ")"; //$NON-NLS-1$ //$NON-NLS-2$
        }
      }
      IFile elementFile = selectionFolder.getFile(name);
      if (!elementFile.exists()) {
        elementFile.create(new FileInputStream(importFile), false, null);
        ModelElement modelElement = ModelElementFileFactory.getModelElement(elementFile);
        if (modelElement != null) {
          ElementWriterFactory.getInstance()
              .createIndicatorDefinitionWriter()
              .create(modelElement, selectionFolder);
          DefinitionHandler.getInstance().reloadIndicatorsDefinitions();
          names.add(name);
          information.add(
              new ReturnCode(
                  DefaultMessagesImpl.getString(
                      "ImportFactory.importedSucess", //$NON-NLS-1$
                      name),
                  true));
        }
      }
    } catch (Exception e) {
      log.error(e);
      information.add(
          new ReturnCode(
              DefaultMessagesImpl.getString("ImportFactory.importedFailed", name), //$NON-NLS-1$
              false));
    }
  }
  importObject.copyJarFiles();

  // ADD xqliu 2012-04-27 TDQ-5149
  checkImportEvent(importItemName, information);
  // ~ TDQ-5149
  return information;
}
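// ---------------------------------------------------------------------------
// Hedged sketch (not from the source): the javacsv configuration the CSV
// import branch above relies on — an explicit delimiter plus a text qualifier
// so that quoted fields may contain the separator. The file name, delimiter,
// qualifier, and "Label" column are placeholders.
import com.csvreader.CsvReader;
import java.io.FileReader;
import java.io.IOException;

class QualifiedCsvSketch {
  public static void main(String[] args) throws IOException {
    CsvReader reader = new CsvReader(new FileReader("indicators.csv"), ';');
    reader.setTextQualifier('"'); // fields may be wrapped in quotes
    reader.setUseTextQualifier(true); // honour the qualifier while parsing
    reader.readHeaders();
    while (reader.readRecord()) {
      // Pull one named column per record, as the import loop does with Label.
      System.out.println(reader.get("Label"));
    }
    reader.close();
  }
}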