public String induceRulesWithTestSet(double testSetPercent) throws DataFormatException { SystemVariables.getInstance() .appendToExperimentNotes("\\n* Total number of PSSM matches in positive sequences: "); // System.out.println("Total number of PSSM matches in positive sequences: "); ArrayList<Feature> posTrainingSet = regulatoryElementService.getRegulatoryElements( regulatoryRegionService.getPositiveRegulatoryRegions(), positiveCutOffScore, posATcomposition, posCGcomposition); SystemVariables.getInstance() .appendToExperimentNotes("\\n* Total number of PSSM matches in negative sequences: "); // System.out.println("Total number of PSSM matches in negative sequences: "); ArrayList<Feature> negTrainingSet = regulatoryElementService.getRegulatoryElements( regulatoryRegionService.getNegativeRegulatoryRegions(), negativeCutOffScore, negATcomposition, negCGcomposition); IlpService formatter = new IlpService(tempIlpJobDirName); formatter.createIlpFilesWithTestSet( regulatoryRegionService.getPositiveRegulatoryRegions(), regulatoryRegionService.getNegativeRegulatoryRegions(), posTrainingSet, negTrainingSet, testSetPercent); String ilpTheory = formatter.runILP(); return ilpTheory; }
public IlpService createIlpFiles() throws DataFormatException { ArrayList<Feature> posRegElements = regulatoryElementService.getRegulatoryElements( regulatoryRegionService.getPositiveRegulatoryRegions(), positiveCutOffScore, posATcomposition, posCGcomposition); if (posRegElements == null) { posRegElements = regulatoryElementService.getRegulatoryElements( regulatoryRegionService.getPositiveRegulatoryRegions(), regulatoryRegionService.getNegativeRegulatoryRegions(), positiveCutOffScore); } // ***** Add statistics addMotifMatchingStatisticsToNotes("positive"); SystemVariables.getInstance().setPosSeqRelElMatchesNum(posRegElements.size()); ArrayList<Feature> negRegElements = regulatoryElementService.getRegulatoryElements( regulatoryRegionService.getNegativeRegulatoryRegions(), negativeCutOffScore, negATcomposition, negCGcomposition); if (negRegElements == null) { negRegElements = regulatoryElementService.getRegulatoryElements( regulatoryRegionService.getNegativeRegulatoryRegions(), null, negativeCutOffScore); } // ***** Add statistics addMotifMatchingStatisticsToNotes("negative"); SystemVariables.getInstance().setNegSeqRelElMatchesNum(negRegElements.size()); /* System.out.println("\n Number of matches of pairs of matrices in jurkat and erythroid sequences"); String[] pwmPair = new String[]{"MA0055.1", "MA0152.1"}; System.out.println(pwmPair[0] + " and " + pwmPair[1]+" :\t" + FeaturesTools.numSequencesContainingAll(pwmPair, posRegElements) + "\t"+ FeaturesTools.numSequencesContainingAll(pwmPair, negRegElements)); System.out.println("\nDistances between Gata and Ebox for positive sequences:"); FeaturesTools.printRegElDistances("Gata", "Ebox", posRegElements); */ IlpService ilpService = new IlpService(tempIlpJobDirName); ilpService.createIlpFiles( regulatoryRegionService.getPositiveRegulatoryRegions(), regulatoryRegionService.getNegativeRegulatoryRegions(), posRegElements, negRegElements); return ilpService; }
/* Generates regulatory regions for ILP positive examples. * Base of the sequence is random ACGT 0.25 : 0.25 : 0.25 : 0.25 composition, * cut-off score for finding PSSMs in model sequences is a default positive cut-off * C.elegans pwm are planted in the base sequence at random positions. * * @param int number of regions to be generates * @param String generic prefix for the name of regulatory sequence */ private ArrayList<Feature> generateRegulatoryRegions(int numOfRegions, String regionNamePrefix) throws DataFormatException { ArrayList<Feature> regRegions = new ArrayList<Feature>(); if (numOfRegions <= 0) { return regRegions; } // Generate probabilities based on C.elegans data // RegulatoryRegionService regRegionService = // RegulatoryRegionServiceFactory.getService(RegulatoryRegionService.C_ELEGANS_DATA_SERVICE, 0, // 0); ArrayList<Feature> cElegansRegRegions = modelRegRegionService.getPositiveRegulatoryRegions(); if (cElegansRegRegions == null) { throw new DataFormatException( "SyntheticRegRegionService: Unable to retrieve C.elegans regulatory regions to generate positive sequences."); } File pwmDir = new File(SystemVariables.getInstance().getString("C.elegans.PWMs.dir")); String tmpJobDir = SystemVariables.getInstance() .getString("temp.output.dir"); // TODO create a proper tmp job dir for experiments PatserRegElementService regElService = new PatserRegElementService(pwmDir, tmpJobDir); double cutOffScore = SystemVariables.getInstance().getPositivePatserCutOffScore(); double atComposition = sequenceNtProbabilities[0] + sequenceNtProbabilities[3]; double cgComposition = sequenceNtProbabilities[1] + sequenceNtProbabilities[2]; ArrayList<Feature> regElements = regElService.getRegulatoryElements( cElegansRegRegions, cutOffScore, atComposition, cgComposition); ArrayList<RegulatoryElementPWM> regElementPWMs = regElService.getRegulatoryElementsPWMs(); StatAnalyser stat = new StatAnalyser(regElements); for (int i = 0; i < numOfRegions; i++) { String sequence = 
generatePosExSequence( stat, regElementPWMs, cElegansRegRegions.get(0).getSequence().length(), cElegansRegRegions.size()); regRegions.add(new Feature(regionNamePrefix + i, "gene", null, 0, 0, sequence, 0.0)); } return regRegions; }
@Override public void updateNumberOfPositiveRegRegions(int num) throws DataFormatException { // TODO check for empty regions // TODO test this int deltaSeqNum = num - positiveRegRegions.size(); if (deltaSeqNum < 0) { // desired number of positive sequences is less than is available now for (int i = 0; i > deltaSeqNum; i--) { positiveRegRegions.remove(0); } } else { // desired number is more than available -> need to generate extra ArrayList<Feature> extraSimulatedGenes = FeaturesTools.generateSimulatedRegulatoryRegionsWithPositionalPSSMs( deltaSeqNum, regulatorySequenceLength, sequenceNtProbabilities, SystemVariables.getInstance().getPositivePatserCutOffScore(), POSITIVE_REGION_PREFIX, modelRegRegionService.getPositiveRegulatoryRegions(), modelRegElService); // generateRegulatoryRegions(deltaSeqNum, POSITIVE_REGION_PREFIX); negativeRegRegions.addAll(extraSimulatedGenes); } }
private String createSequencesFile(ArrayList<Feature> regRegions) throws DataFormatException { if (regRegions == null) { throw new DataFormatException( "Can not create regulatory sequences file for PATSER. No regulatory regions were supplied."); } if (tempPatserOutputDir == null || tempPatserOutputDir.isEmpty()) { // in case the constructor workaround was used throw new DataFormatException( "Can not create regulatory sequences file for PATSER. No temporary output directory was set."); } String seqFileName = tempPatserOutputDir + SystemVariables.getInstance().getString("patser.tmp.seq.output.file.name") + System.currentTimeMillis(); BufferedWriter writer = null; try { writer = new BufferedWriter(new FileWriter(seqFileName)); for (Iterator<Feature> iterator = regRegions.iterator(); iterator.hasNext(); ) { Feature gene = (Feature) iterator.next(); writer.write(gene.getId() + " \\" + gene.getSequence() + "\\\n"); } writer.close(); } catch (IOException e) { e.printStackTrace(); } return seqFileName; }
/*
 * Creates synthetic negative and positive regulatory regions, using C.elegans regions and PWMs
 * as the model.
 *
 * @param numOfPositiveRegRegions number of positive regions to simulate
 * @param negExMultiplicationFactor the number of negative regions is this factor times
 *                                  numOfPositiveRegRegions
 * @throws DataFormatException if the model has no positive regulatory regions, or if a called
 *                             service throws it
 */
public SyntheticRegRegionService(int numOfPositiveRegRegions, int negExMultiplicationFactor)
    throws DataFormatException {
  modelRegRegionService = new CElegansRegRegionService(0);

  // TODO create a proper tmp job dir for experiments
  final File modelPwmDir =
      new File(SystemVariables.getInstance().getString("C.elegans.PWMs.dir"));
  final String scratchDir = SystemVariables.getInstance().getString("temp.output.dir");
  modelRegElService = new PatserRegElementService(modelPwmDir, scratchDir);

  // Without model regions there is nothing to take the sequence length from.
  if (modelRegRegionService.getPositiveRegulatoryRegions().size() <= 0) {
    throw new DataFormatException(
        "Model regulatory regions are empty. Can not model synthetic regions based on an empty regions.");
  }
  regulatorySequenceLength =
      modelRegRegionService.getPositiveRegulatoryRegions().get(0).getSequence().length();

  // TODO: remove - just for testing
  // NOTE: this overrides the length taken from the model regions just above.
  regulatorySequenceLength = 25;

  final int numOfNegativeRegRegions = negExMultiplicationFactor * numOfPositiveRegRegions;
  negativeRegRegions =
      FeaturesTools.generateSimulatedRegulatoryRegions(
          numOfNegativeRegRegions,
          regulatorySequenceLength,
          NEGATIVE_REGION_PREFIX,
          sequenceNtProbabilities);

  positiveRegRegions =
      FeaturesTools.generateSimulatedRegulatoryRegionsWithPositionalPSSMs(
          numOfPositiveRegRegions,
          regulatorySequenceLength,
          sequenceNtProbabilities,
          SystemVariables.getInstance().getPositivePatserCutOffScore(),
          POSITIVE_REGION_PREFIX,
          modelRegRegionService.getPositiveRegulatoryRegions(),
          modelRegElService);
}
/* Note: unsound method * Based on the statistics, collected when extracting motif matches by RegElementService, * collects text note of motif matching statistics, which will be added to ILP file. * This method has to be called immediately after RegElementService.getRegulatoryElements call, * since every such call overrides the pssmMatchStatistics in RegElementService * * @param sequenceKind - can only be "positive" or "negative", depending on the kind of sequences */ private void addMotifMatchingStatisticsToNotes(String sequenceKind) throws DataFormatException { Hashtable<String, Double> pssmMatchStats = regulatoryElementService.getPssmMatchingStatistics(); if (pssmMatchStats != null && !pssmMatchStats.isEmpty()) { SystemVariables.getInstance() .appendToExperimentNotes( "\\n* Number of PSSM matches in " + sequenceKind + " sequences: "); int numberOfRegions = 0; if ("positive".equals(sequenceKind)) { numberOfRegions = regulatoryRegionService.getPositiveRegulatoryRegions().size(); } else { numberOfRegions = regulatoryRegionService.getNegativeRegulatoryRegions().size(); } Enumeration<String> pssmMatchNames = pssmMatchStats.keys(); int totalNumMatches = 0; String r_pssmNames = ""; String r_matches = ""; while (pssmMatchNames.hasMoreElements()) { String pssmName = pssmMatchNames.nextElement(); double pssmStat = pssmMatchStats.get(pssmName); int numMatches = (int) (pssmStat * numberOfRegions); totalNumMatches = totalNumMatches + numMatches; r_pssmNames = r_pssmNames + "\'" + pssmName + "\', "; r_matches = r_matches + numMatches + ", "; SystemVariables.getInstance() .appendToExperimentNotes("\\n\\t" + pssmName + "\\t" + numMatches); } SystemVariables.getInstance() .appendToExperimentNotes("\\n Total number of matches: " + totalNumMatches); // System.out.println("In positive sequences: "); // System.out.println(r_pssmNames); // System.out.println(r_matches); } }
/* * Runs ILP */ public String induceRules() throws DataFormatException { IlpService ilpService = createIlpFiles(); System.out.println("== Created all ILP files. Starting to induce."); // // ==> the star of the show // ilpService.runILP(); // Overwrite temporary result html footer with the one that explains how to read a theory String htmlFooterFileName = tempIlpJobDirName + SystemVariables.getInstance().getString("html.footer.file.name"); FileHandling.writeFile(htmlFooterFileName, FileHandling.getHTMLResultsFooter(true, true)); return "Done"; }
public Explorer( RegulatoryRegionService regRegionService, RegulatoryElementService regElService, String tempIlpJobDirName) throws DataFormatException { this.regulatoryRegionService = regRegionService; this.regulatoryElementService = regElService; this.tempIlpJobDirName = tempIlpJobDirName; positiveCutOffScore = SystemVariables.getInstance().getPositivePatserCutOffScore(); negativeCutOffScore = SystemVariables.getInstance().getNegativePatserCutOffScore(); /////// Statistics SystemVariables.getInstance().cleanStatistics(); // clean old; SystemVariables.getInstance() .setPosSeqNum(regulatoryRegionService.getPositiveRegulatoryRegions().size()); SystemVariables.getInstance() .setNegSeqNum(regulatoryRegionService.getNegativeRegulatoryRegions().size()); SystemVariables.getInstance() .appendToExperimentNotes( "\\n* Number of positive sequences: " + regulatoryRegionService.getPositiveRegulatoryRegions().size()); SystemVariables.getInstance() .appendToExperimentNotes( "\\n* Number of negative sequences: " + regulatoryRegionService.getNegativeRegulatoryRegions().size()); SystemVariables.getInstance() .appendToExperimentNotes( "\\n* Patser cut-off score for positive sequences: " + positiveCutOffScore); SystemVariables.getInstance() .appendToExperimentNotes( "\\n* Patser cut-off score for negative sequences: " + negativeCutOffScore); double[] posNtComposition = FeaturesTools.getNucleotideComposition( regulatoryRegionService.getPositiveRegulatoryRegions()); posATcomposition = posNtComposition[0] + posNtComposition[3]; posCGcomposition = posNtComposition[1] + posNtComposition[2]; SystemVariables.getInstance() .appendToExperimentNotes( "\\n* A:T and C:G composition of positive sequences: " + posATcomposition + " " + posCGcomposition); SystemVariables.getInstance().setPosATcomposition(posATcomposition); SystemVariables.getInstance().setPosCGcomposition(posCGcomposition); double[] negNtComposition = FeaturesTools.getNucleotideComposition( 
regulatoryRegionService.getNegativeRegulatoryRegions()); negATcomposition = negNtComposition[0] + negNtComposition[3]; negCGcomposition = negNtComposition[1] + negNtComposition[2]; SystemVariables.getInstance() .appendToExperimentNotes( "\\n* A:T and C:G composition of negative sequences: " + negATcomposition + " " + negCGcomposition); SystemVariables.getInstance().setNegATcomposition(negATcomposition); SystemVariables.getInstance().setNegCGcomposition(negCGcomposition); }
/* Parses and saves an input stream of several PWMs. Each PWM is saved * in separate file (for Patser) * Expected input stream format: * > matrixName * A | 10 20 30 * C | 20 0 ... */ public static void savePwmFiles(String dirName, String pwmsStr) throws DataFormatException, IOException { int maxPwmNum = Integer.parseInt(SystemVariables.getInstance().getString("regEl.max.pwm.num")); String line = null; String pwmName = ""; int pwmLineCount = 0; int pwmColNum = -1; String pwmString = ""; StringTokenizer stMain = new StringTokenizer(pwmsStr, "\n\r"); int currPwmNum = 0; while (stMain.hasMoreTokens()) { line = stMain.nextToken(); line = line.trim(); if (!(line.matches("\\s*"))) { // ignore blank lines if (pwmName.isEmpty()) { // expecting and annotated line StringTokenizer st = new StringTokenizer(line, " \t"); String token = st.nextToken(); // Check for the line to start with ">" and read the first token after it. The rest of the // line is ignored if (!">".equals(token)) { throw new DataFormatException( "Supplied PSSM(s) are not in the correct format. Each PSSM should be preceded by a line starting with \"> pssm_name\""); } if (st.hasMoreTokens()) { pwmName = st.nextToken().trim(); } else { throw new DataFormatException( "Supplied PSSM(s) are not in the correct format. PSSM name should follow \">\"."); } } else { // expecting a pwm line if (line.matches("[ACGT][\\t ]+\\|([\\t ]+[0-9]+)+")) { int currColNum = line.split("[\\t ]+").length; if (pwmColNum < 0) { pwmColNum = currColNum; } else { if (currColNum != pwmColNum) { throw new DataFormatException( "Supplied PSSM(s) are not in the correct format. Unequal length of rows in one PSSM."); } } pwmString = pwmString + line + "\n"; pwmLineCount++; } else { throw new DataFormatException("Supplied PSSM(s) are not in the correct format. 
"); } } if (!pwmName.isEmpty() && pwmLineCount == 4) { // got all the info for writing a matrix currPwmNum++; // if (currPwmNum > maxPwmNum){ // throw new DataFormatException("Number of biological markers (PSSMs) has exceeded // the limit of " + // maxPwmNum + "."); // } BufferedWriter writer = new BufferedWriter(new FileWriter(dirName + pwmName + ".matrix")); writer.write(pwmString); writer.close(); pwmName = ""; pwmLineCount = 0; pwmColNum = -1; pwmString = ""; } } } }
// TODO: faze it out - now I use Hash as above. Make sure that the score public ArrayList<RegulatoryElementPWM> getRegulatoryElementsPWMs() throws DataFormatException { String pwmLineRegEx1 = "[ACGTacgt][ \t]*\\|([ \t]*\\d+)+[ \t]*"; String pwmLineRegEx2 = "[ACGTacgt][ \t]*\\[([ \t]*\\d+)+[ \t]*\\][ \t]*"; ArrayList<RegulatoryElementPWM> pwms = new ArrayList<RegulatoryElementPWM>(); // *** Get file names of all matrixes final String[] matrixFileNames = new File(matrixFilesDir).list(); if (matrixFileNames == null) { throw new DataFormatException( "Patser Regulatory Element Service has no PSSMs associated with it."); } for (int i = 0; i < matrixFileNames.length; i++) { // Only read files with .matrix extension if (!matrixFileNames[i].endsWith(SystemVariables.getInstance().getString("pwm.extension"))) { continue; } RegulatoryElementPWM currPwmObj = new RegulatoryElementPWM(); String tfbsName = matrixFileNames[i].substring(0, matrixFileNames[i].length() - 7); currPwmObj.setName(tfbsName); BufferedReader bufferedReader = null; try { // for Mac bufferedReader = new BufferedReader(new FileReader(matrixFilesDir + "/" + matrixFileNames[i])); // bufferedReader = new BufferedReader(new FileReader(matrixFilesDir + matrixFileNames[i])); String line = null; int[][] pwmArr = null; int j = -1; while (null != (line = bufferedReader.readLine())) { if (line.matches(pwmLineRegEx1) || line.matches(pwmLineRegEx2)) { j++; if (j > 3) throw new DataFormatException( "Error parsing matrix file <" + tfbsName + ".matrix>. Unexpected line in the file. File should contain only one matrix."); line = line.replaceAll("[ACGTacgt\\|\\[\\]]", ""); // remove everything but the numbers StringTokenizer strTok = new StringTokenizer(line); int lengthOfPwm = strTok.countTokens(); if (pwmArr == null) { pwmArr = new int[lengthOfPwm][4]; } if (lengthOfPwm != pwmArr.length) throw new DataFormatException( "Error parsing matrix file <" + tfbsName + ".matrix>. 
Matrix is unbalanced."); int k = -1; while (strTok.hasMoreElements()) { k++; String token = strTok.nextToken(); pwmArr[k][j] = Integer.parseInt(token); } } } currPwmObj.setPwm(pwmArr); } catch (FileNotFoundException ex) { ex.printStackTrace(); } catch (IOException ex) { ex.printStackTrace(); } finally { try { if (bufferedReader != null) bufferedReader.close(); } catch (IOException ex) { ex.printStackTrace(); } } pwms.add(currPwmObj); } // matrix for return pwms; }
/* Reads the PSSM directory (saved in this instance) with a number of files that contain PSSMs and creates a Hashmap of * these PSSMs keyed by name. * - Each pwm file should have a .matrix extension. "." are permitted in the name (i.e. ma.123.my.matrix is acceptable) * - Each file should have only one pwm in it. * - No "> pmw.." lines accepted in a file. I.e nothing but the actual matrix should be present in a file * - pwms can be of 2 formats: A | 1 2 3... or A [ 1 2 3]... * - blank lines are permitted in the file */ private Hashtable<String, int[][]> readPssmsFromFileSystem() throws DataFormatException { Hashtable<String, int[][]> resultPssms = new Hashtable<String, int[][]>(); String pssmLineRegEx1 = "[ACGTacgt][ \t]*\\|([ \t]*\\d+)+[ \t]*"; String pssmLineRegEx2 = "[ACGTacgt][ \t]*\\[([ \t]*\\d+)+[ \t]*\\][ \t]*"; // *** Get file names of all matrixes final String[] matrixFileNames = new File(matrixFilesDir).list(); for (int i = 0; i < matrixFileNames.length; i++) { // Only read files with .matrix extension if (!matrixFileNames[i].endsWith(SystemVariables.getInstance().getString("pwm.extension"))) { continue; } String pssmName = matrixFileNames[i].substring(0, matrixFileNames[i].length() - 7); BufferedReader bufferedReader = null; try { // for Mac bufferedReader = new BufferedReader(new FileReader(matrixFilesDir + "/" + matrixFileNames[i])); // bufferedReader = new BufferedReader(new FileReader(matrixFilesDir + matrixFileNames[i])); String line = null; int[][] pssmMatrix = null; int j = -1; while (null != (line = bufferedReader.readLine())) { if (line.matches(pssmLineRegEx1) || line.matches(pssmLineRegEx2)) { j++; if (j > 3) throw new DataFormatException( "Error parsing matrix file <" + pssmName + ".matrix>. Unexpected line in the file. 
File should contain only one matrix."); line = line.replaceAll("[ACGTacgt\\|\\[\\]]", ""); // remove everything but the numbers StringTokenizer strTok = new StringTokenizer(line); int lengthOfPwm = strTok.countTokens(); if (pssmMatrix == null) { pssmMatrix = new int[lengthOfPwm][4]; } if (lengthOfPwm != pssmMatrix.length) throw new DataFormatException( "Error parsing matrix file <" + pssmName + ".matrix>. Matrix is unbalanced."); int k = -1; while (strTok.hasMoreElements()) { k++; String token = strTok.nextToken(); pssmMatrix[k][j] = Integer.parseInt(token); } } } resultPssms.put(pssmName, pssmMatrix); } catch (FileNotFoundException ex) { ex.printStackTrace(); } catch (IOException ex) { ex.printStackTrace(); } finally { try { if (bufferedReader != null) bufferedReader.close(); } catch (IOException ex) { ex.printStackTrace(); } } } // matrix for return resultPssms; }
public class PatserRegElementService implements RegulatoryElementService { private String patserInstallDirName = SystemVariables.getInstance().getString("patser.install.dir"); private String matrixFilesDir; private String tempPatserOutputDir; private Hashtable<String, Double> pssmMatchingStats; // keeps track of number of PSSM matches in all sequences / by # of // sequences private Hashtable<String, int[][]> pssms; // hashed by name /* * @param pwmDir name of the directory, which contains one or more pwm files */ public PatserRegElementService(File pwmDir, String tempJobDir) throws DataFormatException { if (tempJobDir != null && !tempJobDir.isEmpty()) { // workaround for the web interface bean: // to load example data we only need pwm dir to extract pwm info this.tempPatserOutputDir = FileHandling.createTempPatserOutputDirectory(tempJobDir); } this.matrixFilesDir = pwmDir.getAbsolutePath(); // TODO } /* Parses a string with multiple PWMs and writes each pwm in an individual, Patser-approved file * in a standard pwm directory, created inside a temporary directory specified. * * @param pmws String with one or more PWMs. The format of the String is: * > pmwName * A | 10 20 1 or A [10 20 1] * C | 0 22 ... C [0 22 ...] * * @param tempJobDir name of a temporary directory for the whole Module Inducer run. * A new directory will be created inside this directory to hold * all the pwm files. 
*/ public PatserRegElementService(String pwms, String tempJobDir) throws DataFormatException, IOException { this.tempPatserOutputDir = FileHandling.createTempPatserOutputDirectory(tempJobDir); this.matrixFilesDir = FileHandling.createTempPwmDirectory(tempPatserOutputDir); savePwmFiles(matrixFilesDir, pwms); // matrixFilesDir = (new File(matrixFilesDir)).getAbsolutePath(); //TODO stupid windows hack } private String createSequencesFile(ArrayList<Feature> regRegions) throws DataFormatException { if (regRegions == null) { throw new DataFormatException( "Can not create regulatory sequences file for PATSER. No regulatory regions were supplied."); } if (tempPatserOutputDir == null || tempPatserOutputDir.isEmpty()) { // in case the constructor workaround was used throw new DataFormatException( "Can not create regulatory sequences file for PATSER. No temporary output directory was set."); } String seqFileName = tempPatserOutputDir + SystemVariables.getInstance().getString("patser.tmp.seq.output.file.name") + System.currentTimeMillis(); BufferedWriter writer = null; try { writer = new BufferedWriter(new FileWriter(seqFileName)); for (Iterator<Feature> iterator = regRegions.iterator(); iterator.hasNext(); ) { Feature gene = (Feature) iterator.next(); writer.write(gene.getId() + " \\" + gene.getSequence() + "\\\n"); } writer.close(); } catch (IOException e) { e.printStackTrace(); } return seqFileName; } /* Finds regulatory elements in regulatory sequences * @param regRegions - list of regulatory regions in which to look for the reg. elements * @param cutOffScore - minimum score cut off (-ls option in Patser). 
Matches with lower * score will not be accepted */ public ArrayList<Feature> getRegulatoryElements( ArrayList<Feature> regRegions, double cutOffScore, double atComposition, double cgComposition) throws DataFormatException { /* Verified: * When Patser matches an "R" sequence (reverse complement), it reports a start position in the * original sequence of the reverse complement of a PSSM. */ ///// Patser parameters : // double cutOffScore = SystemVariables.getInstance().getString(""); ArrayList<Feature> tfbsHits = new ArrayList<Feature>(); // *** Write genes into file acceptable by PATSER final String seqFileName = createSequencesFile(regRegions); // *** Get file names of all matrixes final String[] matrixFileNames = new File(matrixFilesDir).list(); pssmMatchingStats = new Hashtable<String, Double>(); for (int i = 0; i < matrixFileNames.length; i++) { // Only read files with .matrix extension if (!matrixFileNames[i].endsWith(SystemVariables.getInstance().getString("pwm.extension"))) { continue; } // *** Run PATSER tool and extract tfbs info from its output try { Runtime rt = Runtime.getRuntime(); // Process pr = rt.exec("cmd /c dir"); // Process pr = rt.exec("pwd"); File patserDir = new File(patserInstallDirName); Process pr; if (System.getProperty("os.name").startsWith("Mac")) { final String cmd = "./patser-v3e -A a:t " + atComposition + " c:g " + cgComposition + " -b 1 -c -d1 -ls " + cutOffScore + " -p -s -m " + matrixFilesDir + "/" + matrixFileNames[i] + " -f " + seqFileName; pr = rt.exec(new String[] {"/bin/sh", "-c", cmd}, null, patserDir); // pr = rt.exec(cmd); // pr = rt.exec( new String[] { "patser-v3e", "-A", "a:t 0.25 c:g 0.25", "-b", "1", "-c", // "-d1", "-ls", "7", "-p", "-s", "-m", matrixFileNames[i], "-f", seqFileName } ); } else { // i.e. 
Windows String patserCommand; // Stupid windows hack if (matrixFilesDir.endsWith("/")) { patserCommand = "cmd /c patser-v3e -A a:t " + atComposition + " c:g " + cgComposition + " -b 1 -c -d1 -ls " + cutOffScore + " -p -s -m \"" + matrixFilesDir + matrixFileNames[i] + "\" -f \"" + seqFileName + "\""; } else { patserCommand = "cmd /c patser-v3e -A a:t " + atComposition + " c:g " + cgComposition + " -b 1 -c -d1 -ls " + cutOffScore + " -p -s -m \"" + matrixFilesDir + "\\" + matrixFileNames[i] + "\" -f \"" + seqFileName + "\""; } // String patserCommand = "cmd /c patser-v3e -A a:t 0.25 c:g 0.25 -b 1 -c -d1 -ls 7 -p // -s" // + " -m " + matrixFilesDir + matrixFileNames[i] + " -f " + seqFileName; pr = rt.exec(patserCommand, new String[] {"PATH=C:/cygwin/bin"}, patserDir); } BufferedReader input = new BufferedReader(new InputStreamReader(pr.getInputStream())); String line = null; String pssmName = matrixFileNames[i].substring(0, matrixFileNames[i].length() - 7); int hits = 0; String lastGeneName = ""; // TODO String word; // Read the input and filter out matrix hits data while ((line = input.readLine()) != null) { // System.out.println(line); if (line.contains("position=") && line.contains("score=")) { // result line Feature regElement = new Feature("TF_binding_site"); // Set tfbs name, but filter the .matrix extension regElement.setName(pssmName); // hit.setNameAttribute(matrixFileNames[i]); StringTokenizer st = new StringTokenizer(line); // first one is the name word = st.nextToken(); regElement.setParent(word); while (st.hasMoreTokens()) { word = st.nextToken(); if (word.equals("position=")) { word = st.nextToken(); if (word.endsWith("C")) { regElement.setStrand("R"); word = word.substring(0, word.indexOf("C")); } else { regElement.setStrand("D"); } regElement.setStartPosition(Integer.parseInt(word)); } else if (word.equals("score=")) { word = st.nextToken(); regElement.setScore(Double.parseDouble(word)); } else if (word.equals("sequence=")) { word = st.nextToken(); 
regElement.setSequence(word); // Now that we have TFBS sequence, we can set the end position of TFBS: regElement.setEndPosition(regElement.getStartPosition() + word.length()); } } // end of tokenizer // This is to calculate statistics of PSSM hits // if (!lastGeneName.equals(regElement.getParent())){ hits++; lastGeneName = regElement.getParent(); // } tfbsHits.add(regElement); } } // end of reading patser input pssmMatchingStats.put(pssmName, (double) hits / regRegions.size()); // int exitVal = pr.waitFor(); // System.out.println("Exited with error code " + exitVal); } catch (IOException e) { e.printStackTrace(); throw new DataFormatException("PATSER execution failed. "); } } File seqFile = new File(seqFileName); seqFile.delete(); /* // add begin and end of sequence "fake" matches, or markers; like ^ and $ for begin and ed sequence in regEx // this is for ILP to be able to induce rules with begin and end of sequence (distance, etc.) for (Iterator<Feature> iterator = regRegions.iterator(); iterator.hasNext();) { Feature feature = (Feature) iterator.next(); // markers are place before the sequence begins and after it ends tfbsHits.add(new Feature("generic_marker", "begin", feature.getId(), "D", 0, 0, 0.0)); tfbsHits.add(new Feature("generic_marker", "end", feature.getId(), "D", feature.getSequence().length()+2, feature.getSequence().length()+2, 0.0)); } */ if (tfbsHits.isEmpty()) { throw new DataFormatException("PATSER did not locate any motifs in the data."); } return tfbsHits; } /* Statistics for the last Patser run. With every getRegulatoryElements() run it gets overwritten. 
* * (non-Javadoc) * @see ca.uottawa.okorol.bioinf.ModuleInducer.interfaces.RegulatoryElementService#getPssmMatchingStatistics() */
  public Hashtable<String, Double> getPssmMatchingStatistics() {
    // Returns the table itself, not a copy.
    return pssmMatchingStats;
  }

  /**
   * Returns the PSSMs used in this Patser instance, keyed by name.
   *
   * <p>The matrices are read from the file system the first time this is called and again
   * whenever the cached table is empty.
   *
   * @return table of PSSM name to matrix
   * @throws DataFormatException if the matrix directory cannot be listed or a matrix file is
   *     malformed
   */
  public Hashtable<String, int[][]> getPssms() throws DataFormatException {
    if (pssms == null || pssms.isEmpty()) {
      pssms = readPssmsFromFileSystem();
    }
    return pssms;
  }

  /**
   * Reads the PSSM directory (saved in this instance) with a number of files that contain PSSMs
   * and creates a table of these PSSMs keyed by name.
   *
   * <ul>
   *   <li>Each pwm file should have a .matrix extension. "." are permitted in the name (i.e.
   *       ma.123.my.matrix is acceptable).
   *   <li>Each file should have only one pwm in it.
   *   <li>No "> pwm.." lines accepted in a file, i.e. nothing but the actual matrix should be
   *       present in a file.
   *   <li>pwms can be of 2 formats: A | 1 2 3... or A [ 1 2 3]...
   *   <li>Blank lines are permitted in the file.
   * </ul>
   *
   * <p>Files that cannot be opened or read are reported on stderr and skipped.
   *
   * @return table of PSSM name to matrix
   * @throws DataFormatException if the matrix directory cannot be listed, or a matrix file is
   *     malformed or contains no matrix rows
   */
  private Hashtable<String, int[][]> readPssmsFromFileSystem() throws DataFormatException {
    Hashtable<String, int[][]> resultPssms = new Hashtable<String, int[][]>();

    // *** Get file names of all matrixes
    final String[] matrixFileNames = new File(matrixFilesDir).list();
    if (matrixFileNames == null) {
      // File.list() returns null when the path is not a readable directory.
      throw new DataFormatException(
          "Patser Regulatory Element Service has no PSSMs associated with it.");
    }

    for (int i = 0; i < matrixFileNames.length; i++) {
      // Only read files with .matrix extension
      if (!matrixFileNames[i].endsWith(SystemVariables.getInstance().getString("pwm.extension"))) {
        continue;
      }

      // Strips the last 7 characters; presumably the ".matrix" suffix - confirm against the
      // "pwm.extension" setting.
      String pssmName = matrixFileNames[i].substring(0, matrixFileNames[i].length() - 7);

      try {
        int[][] pssmMatrix = parsePssmFile(matrixFileNames[i], pssmName);
        if (pssmMatrix == null) {
          // Hashtable rejects null values, so fail with a descriptive, declared exception.
          throw new DataFormatException(
              "Error parsing matrix file <" + pssmName + ".matrix>. No matrix rows found in the file.");
        }
        resultPssms.put(pssmName, pssmMatrix);
      } catch (IOException ex) {
        // Best effort: an unreadable file is reported and skipped.
        ex.printStackTrace();
      }
    } // matrix for

    return resultPssms;
  }

  /**
   * Parses one matrix file from the matrix directory.
   *
   * <p>Only lines of the form "A | 1 2 3" or "A [ 1 2 3 ]" are interpreted; every other line is
   * ignored. Rows are stored in the order they appear in the file (row index 0..3); the letter
   * at the start of a row is not used to choose the row index.
   *
   * @param fileName name of the file inside the matrix directory
   * @param matrixName name used in error messages
   * @return matrix indexed as [position][row], or null if the file has no matrix rows
   * @throws IOException if the file cannot be opened or read
   * @throws DataFormatException if the file has more than 4 matrix rows or rows of unequal length
   */
  private int[][] parsePssmFile(String fileName, String matrixName)
      throws IOException, DataFormatException {
    final String rowRegEx1 = "[ACGTacgt][ \t]*\\|([ \t]*\\d+)+[ \t]*";
    final String rowRegEx2 = "[ACGTacgt][ \t]*\\[([ \t]*\\d+)+[ \t]*\\][ \t]*";

    BufferedReader bufferedReader = null;
    try {
      // "/" separator was chosen "for Mac" by the original author.
      bufferedReader = new BufferedReader(new FileReader(matrixFilesDir + "/" + fileName));

      int[][] matrix = null;
      int row = -1;
      String line;
      while ((line = bufferedReader.readLine()) != null) {
        if (!line.matches(rowRegEx1) && !line.matches(rowRegEx2)) {
          continue;
        }

        row++;
        if (row > 3) {
          throw new DataFormatException(
              "Error parsing matrix file <"
                  + matrixName
                  + ".matrix>. Unexpected line in the file. File should contain only one matrix.");
        }

        line = line.replaceAll("[ACGTacgt\\|\\[\\]]", ""); // remove everything but the numbers
        StringTokenizer strTok = new StringTokenizer(line);
        int lengthOfPwm = strTok.countTokens();

        if (matrix == null) {
          matrix = new int[lengthOfPwm][4];
        }
        if (lengthOfPwm != matrix.length) {
          throw new DataFormatException(
              "Error parsing matrix file <" + matrixName + ".matrix>. Matrix is unbalanced.");
        }

        int position = 0;
        while (strTok.hasMoreTokens()) {
          matrix[position][row] = Integer.parseInt(strTok.nextToken());
          position++;
        }
      }
      return matrix;
    } finally {
      if (bufferedReader != null) {
        try {
          bufferedReader.close();
        } catch (IOException ex) {
          ex.printStackTrace();
        }
      }
    }
  }

  /**
   * (non-Javadoc)
   *
   * @see
   *     ca.uottawa.okorol.bioinf.ModuleInducer.interfaces.RegulatoryElementService#getRegulatoryElementsPWMs()
   *     <p>Reads a directory with a number of files that contain pwms and creates an ArrayList of
   *     pwm objects. The file format rules are the same as for readPssmsFromFileSystem():
   *     <ul>
   *       <li>Each pwm file should have a .matrix extension. "." are permitted in the name (i.e.
   *           ma.123.my.matrix is acceptable).
   *       <li>Each file should have only one pwm in it.
   *       <li>No "> pwm.." lines accepted in a file.
   *       <li>pwms can be of 2 formats: A | 1 2 3... or A [ 1 2 3]...
   *       <li>Blank lines are permitted in the file.
   *     </ul>
   *     <p>A file that cannot be opened or read is reported on stderr; its entry is still added
   *     to the result, with only the name set.
   * @return one RegulatoryElementPWM per matrix file
   * @throws DataFormatException if the matrix directory cannot be listed or a matrix file is
   *     malformed
   */
  // TODO: phase it out - now I use Hash as above. (Original note continues, truncated:
  // "Make sure that the score")
  public ArrayList<RegulatoryElementPWM> getRegulatoryElementsPWMs() throws DataFormatException {
    ArrayList<RegulatoryElementPWM> pwms = new ArrayList<RegulatoryElementPWM>();

    // *** Get file names of all matrixes
    final String[] matrixFileNames = new File(matrixFilesDir).list();
    if (matrixFileNames == null) {
      throw new DataFormatException(
          "Patser Regulatory Element Service has no PSSMs associated with it.");
    }

    for (int i = 0; i < matrixFileNames.length; i++) {
      // Only read files with .matrix extension
      if (!matrixFileNames[i].endsWith(SystemVariables.getInstance().getString("pwm.extension"))) {
        continue;
      }

      RegulatoryElementPWM currPwmObj = new RegulatoryElementPWM();
      String tfbsName = matrixFileNames[i].substring(0, matrixFileNames[i].length() - 7);
      currPwmObj.setName(tfbsName);

      try {
        currPwmObj.setPwm(parsePssmFile(matrixFileNames[i], tfbsName));
      } catch (IOException ex) {
        ex.printStackTrace();
      }

      pwms.add(currPwmObj);
    } // matrix for

    return pwms;
  }

  /**
   * Parses a string holding several PWMs and saves each PWM in a separate file (for Patser)
   * named {@code dirName + pwmName + ".matrix"}.
   *
   * <p>Expected format:
   *
   * <pre>
   * > matrixName
   * A | 10 20 30
   * C | 20 0 ...
   * </pre>
   *
   * <p>Blank lines are ignored. Text after the name on the "&gt;" line is ignored. A PWM is
   * written as soon as its 4th row has been read. No limit on the number of PWMs is enforced.
   *
   * @param dirName directory prefix; it is concatenated with the file name as is, so it must end
   *     with a path separator
   * @param pwmsStr text with one or more PWMs
   * @throws DataFormatException if the text does not follow the expected format, a PWM name
   *     contains a path separator, or the last PWM has fewer than 4 rows
   * @throws IOException if a matrix file cannot be written
   */
  public static void savePwmFiles(String dirName, String pwmsStr)
      throws DataFormatException, IOException {
    String pwmName = "";
    int pwmLineCount = 0;
    int pwmColNum = -1;
    StringBuilder pwmString = new StringBuilder();

    StringTokenizer stMain = new StringTokenizer(pwmsStr, "\n\r");
    while (stMain.hasMoreTokens()) {
      String line = stMain.nextToken().trim();
      if (line.isEmpty()) { // ignore blank lines
        continue;
      }

      if (pwmName.isEmpty()) { // expecting an annotated line
        StringTokenizer st = new StringTokenizer(line, " \t");
        String token = st.nextToken();

        // Check for the line to start with ">" and read the first token after it. The rest of
        // the line is ignored
        if (!">".equals(token)) {
          throw new DataFormatException(
              "Supplied PSSM(s) are not in the correct format. Each PSSM should be preceded by a line starting with \"> pssm_name\"");
        }
        if (!st.hasMoreTokens()) {
          throw new DataFormatException(
              "Supplied PSSM(s) are not in the correct format. PSSM name should follow \">\".");
        }
        pwmName = st.nextToken().trim();
      } else { // expecting a pwm line
        if (!line.matches("[ACGT][\\t ]+\\|([\\t ]+[0-9]+)+")) {
          throw new DataFormatException("Supplied PSSM(s) are not in the correct format. ");
        }
        int currColNum = line.split("[\\t ]+").length;
        if (pwmColNum < 0) {
          pwmColNum = currColNum;
        } else if (currColNum != pwmColNum) {
          throw new DataFormatException(
              "Supplied PSSM(s) are not in the correct format. Unequal length of rows in one PSSM.");
        }
        pwmString.append(line).append("\n");
        pwmLineCount++;
      }

      if (!pwmName.isEmpty() && pwmLineCount == 4) { // got all the info for writing a matrix
        writePwmFile(dirName, pwmName, pwmString.toString());
        pwmName = "";
        pwmLineCount = 0;
        pwmColNum = -1;
        pwmString.setLength(0);
      }
    }

    if (!pwmName.isEmpty()) {
      // Input ended in the middle of a PWM; report it instead of silently dropping the matrix.
      throw new DataFormatException(
          "Supplied PSSM(s) are not in the correct format. PSSM <"
              + pwmName
              + "> is incomplete: 4 rows are expected.");
    }
  }

  /**
   * Reads the whole stream as text (platform default charset) and delegates to {@link
   * #savePwmFiles(String, String)}. This replaces an older stream-based parser that was phased
   * out. The stream is not closed here.
   *
   * @param dirName directory prefix, see {@link #savePwmFiles(String, String)}
   * @param in stream with one or more PWMs
   * @throws DataFormatException see {@link #savePwmFiles(String, String)}
   * @throws IOException if the stream cannot be read or a matrix file cannot be written
   */
  public static void savePwmFiles(String dirName, InputStream in)
      throws DataFormatException, IOException {
    // Decode through one reader so that a multi-byte character split across two read() calls is
    // not corrupted (decoding each byte chunk separately would break it).
    InputStreamReader reader = new InputStreamReader(in);
    StringBuilder out = new StringBuilder();
    char[] chunk = new char[4096];
    for (int n; (n = reader.read(chunk)) != -1; ) {
      out.append(chunk, 0, n);
    }
    savePwmFiles(dirName, out.toString());
  }

  /**
   * Parses an input stream of several PWMs in bracket format and saves each PWM in a separate
   * file (for Patser) named {@code dirName + pwmName + ".matrix"}.
   *
   * <p>Expected input stream format:
   *
   * <pre>
   * > matrixName
   * A [ 10 20 30 ]
   * C [ 20 0 ...
   * </pre>
   *
   * <p>Converted files are of the format {@code A | 10 20 10...}: every "[" becomes "|" and a
   * closing "]" at the end of a row is removed. Rows are not validated beyond that. Blank lines
   * are ignored. The stream is not closed here.
   *
   * @param dirName directory prefix; it is concatenated with the file name as is, so it must end
   *     with a path separator
   * @param in stream with one or more PWMs
   * @throws DataFormatException if a PWM is not preceded by a "&gt; name" line, a PWM name
   *     contains a path separator, or the last PWM has fewer than 4 rows
   * @throws IOException if the stream cannot be read or a matrix file cannot be written
   */
  public static void saveAndConvertPwmFiles(String dirName, InputStream in)
      throws DataFormatException, IOException {
    BufferedReader input = new BufferedReader(new InputStreamReader(in));

    String pwmName = "";
    int pwmLineCount = 0;
    StringBuilder pwmString = new StringBuilder();

    String line;
    while ((line = input.readLine()) != null) {
      line = line.trim();
      if (line.isEmpty()) { // ignore blank lines
        continue;
      }

      if (pwmName.isEmpty()) { // expecting an annotated line
        if (!line.startsWith(">")) {
          throw new DataFormatException(
              "PWM sequence is not in the correct format. Each PWM should be preceded by a line starting with \"> pwmName\"");
        }
        StringTokenizer st = new StringTokenizer(line.substring(1), " \t");
        if (!st.hasMoreTokens()) {
          throw new DataFormatException(
              "PWM sequence is not in the correct format. PWM name should follow \">\".");
        }
        pwmName = st.nextToken();
      } else { // expecting a pwm line
        String row = line.replace('[', '|');
        // Drop the closing bracket only when it is really there; cutting the last character
        // unconditionally would eat a digit of a row that has no "]".
        if (row.endsWith("]")) {
          row = row.substring(0, row.length() - 1);
        }
        pwmString.append(row).append("\n");
        pwmLineCount++;
      }

      if (!pwmName.isEmpty() && pwmLineCount == 4) { // got all the info for writing a matrix
        writePwmFile(dirName, pwmName, pwmString.toString());
        pwmName = "";
        pwmLineCount = 0;
        pwmString.setLength(0);
      }
    }

    if (!pwmName.isEmpty()) {
      // Input ended in the middle of a PWM; report it instead of silently dropping the matrix.
      throw new DataFormatException(
          "PWM sequence is not in the correct format. PWM <"
              + pwmName
              + "> is incomplete: 4 rows are expected.");
    }
  }

  /**
   * Writes one matrix to {@code dirName + pwmName + ".matrix"}, always closing the writer.
   *
   * @throws DataFormatException if the name contains a path separator (the name comes from
   *     supplied text and must not be able to redirect the write outside {@code dirName})
   * @throws IOException if the file cannot be written
   */
  private static void writePwmFile(String dirName, String pwmName, String pwmString)
      throws DataFormatException, IOException {
    if (pwmName.indexOf('/') >= 0 || pwmName.indexOf('\\') >= 0) {
      throw new DataFormatException(
          "Supplied PSSM(s) are not in the correct format. PSSM name <"
              + pwmName
              + "> must not contain path separators.");
    }

    BufferedWriter writer = new BufferedWriter(new FileWriter(dirName + pwmName + ".matrix"));
    try {
      writer.write(pwmString);
    } finally {
      writer.close();
    }
  }

  /**
   * Not supported by the Patser service.
   *
   * @throws DataFormatException always
   */
  @Override
  public ArrayList<Feature> getRegulatoryElements(
      ArrayList<Feature> regRegions, ArrayList<Feature> backgroundRegRegions, double cutOffScore)
      throws DataFormatException {
    throw new DataFormatException("This method is not applicable to Patser Service");
  }
}
/* Finds regulatory elements in regulatory sequences * @param regRegions - list of regulatory regions in which to look for the reg. elements * @param cutOffScore - minimum score cut off (-ls option in Patser). Matches with lower * score will not be accepted */ public ArrayList<Feature> getRegulatoryElements( ArrayList<Feature> regRegions, double cutOffScore, double atComposition, double cgComposition) throws DataFormatException { /* Verified: * When Patser matches an "R" sequence (reverse complement), it reports a start position in the * original sequence of the reverse complement of a PSSM. */ ///// Patser parameters : // double cutOffScore = SystemVariables.getInstance().getString(""); ArrayList<Feature> tfbsHits = new ArrayList<Feature>(); // *** Write genes into file acceptable by PATSER final String seqFileName = createSequencesFile(regRegions); // *** Get file names of all matrixes final String[] matrixFileNames = new File(matrixFilesDir).list(); pssmMatchingStats = new Hashtable<String, Double>(); for (int i = 0; i < matrixFileNames.length; i++) { // Only read files with .matrix extension if (!matrixFileNames[i].endsWith(SystemVariables.getInstance().getString("pwm.extension"))) { continue; } // *** Run PATSER tool and extract tfbs info from its output try { Runtime rt = Runtime.getRuntime(); // Process pr = rt.exec("cmd /c dir"); // Process pr = rt.exec("pwd"); File patserDir = new File(patserInstallDirName); Process pr; if (System.getProperty("os.name").startsWith("Mac")) { final String cmd = "./patser-v3e -A a:t " + atComposition + " c:g " + cgComposition + " -b 1 -c -d1 -ls " + cutOffScore + " -p -s -m " + matrixFilesDir + "/" + matrixFileNames[i] + " -f " + seqFileName; pr = rt.exec(new String[] {"/bin/sh", "-c", cmd}, null, patserDir); // pr = rt.exec(cmd); // pr = rt.exec( new String[] { "patser-v3e", "-A", "a:t 0.25 c:g 0.25", "-b", "1", "-c", // "-d1", "-ls", "7", "-p", "-s", "-m", matrixFileNames[i], "-f", seqFileName } ); } else { // i.e. 
Windows String patserCommand; // Stupid windows hack if (matrixFilesDir.endsWith("/")) { patserCommand = "cmd /c patser-v3e -A a:t " + atComposition + " c:g " + cgComposition + " -b 1 -c -d1 -ls " + cutOffScore + " -p -s -m \"" + matrixFilesDir + matrixFileNames[i] + "\" -f \"" + seqFileName + "\""; } else { patserCommand = "cmd /c patser-v3e -A a:t " + atComposition + " c:g " + cgComposition + " -b 1 -c -d1 -ls " + cutOffScore + " -p -s -m \"" + matrixFilesDir + "\\" + matrixFileNames[i] + "\" -f \"" + seqFileName + "\""; } // String patserCommand = "cmd /c patser-v3e -A a:t 0.25 c:g 0.25 -b 1 -c -d1 -ls 7 -p // -s" // + " -m " + matrixFilesDir + matrixFileNames[i] + " -f " + seqFileName; pr = rt.exec(patserCommand, new String[] {"PATH=C:/cygwin/bin"}, patserDir); } BufferedReader input = new BufferedReader(new InputStreamReader(pr.getInputStream())); String line = null; String pssmName = matrixFileNames[i].substring(0, matrixFileNames[i].length() - 7); int hits = 0; String lastGeneName = ""; // TODO String word; // Read the input and filter out matrix hits data while ((line = input.readLine()) != null) { // System.out.println(line); if (line.contains("position=") && line.contains("score=")) { // result line Feature regElement = new Feature("TF_binding_site"); // Set tfbs name, but filter the .matrix extension regElement.setName(pssmName); // hit.setNameAttribute(matrixFileNames[i]); StringTokenizer st = new StringTokenizer(line); // first one is the name word = st.nextToken(); regElement.setParent(word); while (st.hasMoreTokens()) { word = st.nextToken(); if (word.equals("position=")) { word = st.nextToken(); if (word.endsWith("C")) { regElement.setStrand("R"); word = word.substring(0, word.indexOf("C")); } else { regElement.setStrand("D"); } regElement.setStartPosition(Integer.parseInt(word)); } else if (word.equals("score=")) { word = st.nextToken(); regElement.setScore(Double.parseDouble(word)); } else if (word.equals("sequence=")) { word = st.nextToken(); 
regElement.setSequence(word); // Now that we have TFBS sequence, we can set the end position of TFBS: regElement.setEndPosition(regElement.getStartPosition() + word.length()); } } // end of tokenizer // This is to calculate statistics of PSSM hits // if (!lastGeneName.equals(regElement.getParent())){ hits++; lastGeneName = regElement.getParent(); // } tfbsHits.add(regElement); } } // end of reading patser input pssmMatchingStats.put(pssmName, (double) hits / regRegions.size()); // int exitVal = pr.waitFor(); // System.out.println("Exited with error code " + exitVal); } catch (IOException e) { e.printStackTrace(); throw new DataFormatException("PATSER execution failed. "); } } File seqFile = new File(seqFileName); seqFile.delete(); /* // add begin and end of sequence "fake" matches, or markers; like ^ and $ for begin and ed sequence in regEx // this is for ILP to be able to induce rules with begin and end of sequence (distance, etc.) for (Iterator<Feature> iterator = regRegions.iterator(); iterator.hasNext();) { Feature feature = (Feature) iterator.next(); // markers are place before the sequence begins and after it ends tfbsHits.add(new Feature("generic_marker", "begin", feature.getId(), "D", 0, 0, 0.0)); tfbsHits.add(new Feature("generic_marker", "end", feature.getId(), "D", feature.getSequence().length()+2, feature.getSequence().length()+2, 0.0)); } */ if (tfbsHits.isEmpty()) { throw new DataFormatException("PATSER did not locate any motifs in the data."); } return tfbsHits; }