/**
 * Builds one derived train-test data set per entry in {@code sources}.
 *
 * <p>Each result copies the source's attributes, adds a {@code "Retain"} attribute
 * holding the configured retain value, reuses the source's query and test data
 * unchanged, and replaces the training data with the result of
 * {@code downsample} applied to the source training data and the user IDs
 * reported by the test data's user DAO. The result is named after this task when
 * {@code getName()} is non-null, otherwise after the source data set.
 *
 * @return the derived data sets, in the iteration order of {@code sources}
 * @throws TaskExecutionException wrapping any {@link IOException} raised while
 *         building a data set
 * @throws InterruptedException declared by the overridden method
 */
@Override
protected List<TTDataSet> perform() throws TaskExecutionException, InterruptedException {
    Preconditions.checkNotNull(sources);
    List<TTDataSet> results = new ArrayList<TTDataSet>(sources.size());
    try {
        for (TTDataSet source : sources) {
            GenericTTDataBuilder ttBuilder = new GenericTTDataBuilder();

            // Prefer the task's own name; fall back to the source data set's name.
            String taskName = getName();
            ttBuilder.setName(taskName != null ? taskName : source.getName());

            for (Map.Entry<String, Object> attribute : source.getAttributes().entrySet()) {
                ttBuilder.setAttribute(attribute.getKey(), attribute.getValue());
            }
            // Set after the copy so it replaces any "Retain" attribute on the source.
            ttBuilder.setAttribute("Retain", retain);

            ttBuilder.setQuery(source.getQueryData());
            ttBuilder.setTest(source.getTestData());

            LongSet testUserIds = source.getTestData().getUserDAO().getUserIds();
            ttBuilder.setTrain(downsample(source.getTrainingData(), testUserIds));

            results.add(ttBuilder.build());
        }
    } catch (IOException e) {
        throw new TaskExecutionException(e);
    }
    return results;
}
private DataSource downsample(DataSource data, LongSet testUsers) throws IOException { String fileName = getFileName(data); File output = new File(fileName); UpToDateChecker checker = new UpToDateChecker(); checker.addInput(data.lastModified()); checker.addOutput(output); if (!checker.isUpToDate()) { RandomOrder<Rating> order = new RandomOrder<Rating>(); Random rng = new Random(); // write datasource CSVWriter csv = null; try { csv = CSVWriter.open(output, null); Cursor<UserHistory<Rating>> histories = data.getUserEventDAO().streamEventsByUser(Rating.class); for (UserHistory<Rating> ratings : histories) { List<Rating> rats = new ArrayList<Rating>(ratings); order.apply(rats, rng); for (int i = 0; i < rats.size(); i++) { if (!testUsers.contains(ratings.getUserId()) || i < retain) { Rating rating = rats.get(i); Preference pref = rating.getPreference(); csv.writeRow( Lists.newArrayList( rating.getUserId(), rating.getItemId(), rating.getValue(), rating.getTimestamp())); } } } } finally { if (csv != null) { csv.close(); } } } CSVDataSourceBuilder builder = new CSVDataSourceBuilder(data.getName()); builder.setDomain(data.getPreferenceDomain()); builder.setFile(output); return builder.build(); }