コード例 #1
0
ファイル: Explorer.java プロジェクト: turcotte/moduleinducer
  /**
   * Extracts regulatory elements for the positive and the negative regions, writes the ILP files
   * through {@code IlpService.createIlpFilesWithTestSet}, and runs ILP.
   *
   * <p>A heading is appended to the experiment notes before each of the two extractions; the
   * positive regions are always processed first.
   *
   * @param testSetPercent forwarded unchanged to {@code IlpService.createIlpFilesWithTestSet}
   * @return the value returned by {@code IlpService.runILP()}
   * @throws DataFormatException propagated from the services this method calls
   */
  public String induceRulesWithTestSet(double testSetPercent) throws DataFormatException {
    final SystemVariables settings = SystemVariables.getInstance();

    // Positive sequences: heading first, then the extraction itself.
    settings.appendToExperimentNotes("\\n* Total number of PSSM matches in positive sequences: ");
    final ArrayList<Feature> positiveMatches =
        regulatoryElementService.getRegulatoryElements(
            regulatoryRegionService.getPositiveRegulatoryRegions(),
            positiveCutOffScore,
            posATcomposition,
            posCGcomposition);

    // Negative sequences: same procedure with the negative cut-off and composition.
    settings.appendToExperimentNotes("\\n* Total number of PSSM matches in negative sequences: ");
    final ArrayList<Feature> negativeMatches =
        regulatoryElementService.getRegulatoryElements(
            regulatoryRegionService.getNegativeRegulatoryRegions(),
            negativeCutOffScore,
            negATcomposition,
            negCGcomposition);

    final IlpService ilp = new IlpService(tempIlpJobDirName);
    ilp.createIlpFilesWithTestSet(
        regulatoryRegionService.getPositiveRegulatoryRegions(),
        regulatoryRegionService.getNegativeRegulatoryRegions(),
        positiveMatches,
        negativeMatches,
        testSetPercent);

    return ilp.runILP();
  }
コード例 #2
0
  /**
   * Resizes the list of positive regulatory regions so that it holds {@code num} entries.
   *
   * <p>When there are too many regions, the surplus is dropped from the front of the list. When
   * there are too few, the missing ones are produced by {@code
   * FeaturesTools.generateSimulatedRegulatoryRegionsWithPositionalPSSMs} and appended to the
   * positive list. When the list already has the requested size, nothing is done.
   *
   * @param num desired number of positive regions; a negative value asks for more removals than
   *     there are regions and ends in an {@link IndexOutOfBoundsException}
   * @throws DataFormatException propagated from the region generator
   */
  @Override
  public void updateNumberOfPositiveRegRegions(int num) throws DataFormatException {
    // TODO check for empty regions
    // TODO test this
    int deltaSeqNum = num - positiveRegRegions.size();

    if (deltaSeqNum < 0) {
      // More regions than requested: drop the surplus from the head in a single operation
      // instead of shifting the whole list once per removed element.
      positiveRegRegions.subList(0, -deltaSeqNum).clear();
    } else if (deltaSeqNum > 0) {
      // Fewer regions than requested: generate exactly the missing amount.
      ArrayList<Feature> extraSimulatedGenes =
          FeaturesTools.generateSimulatedRegulatoryRegionsWithPositionalPSSMs(
              deltaSeqNum,
              regulatorySequenceLength,
              sequenceNtProbabilities,
              SystemVariables.getInstance().getPositivePatserCutOffScore(),
              POSITIVE_REGION_PREFIX,
              modelRegRegionService.getPositiveRegulatoryRegions(),
              modelRegElService);

      // The new regions are positive examples, so they belong in the positive list
      // (they were previously appended to negativeRegRegions by mistake).
      positiveRegRegions.addAll(extraSimulatedGenes);
    }
  }
コード例 #3
0
ファイル: Explorer.java プロジェクト: turcotte/moduleinducer
  /**
   * Extracts regulatory elements for the positive and negative regions, records the match
   * statistics, and writes the ILP files.
   *
   * <p>For each kind of sequence the four-argument {@code getRegulatoryElements} overload is tried
   * first; if it returns {@code null}, the three-argument overload is used instead. The statistics
   * note is added right after each extraction, before the next extraction can take place.
   *
   * @return the {@code IlpService} on which {@code createIlpFiles} has been called
   * @throws DataFormatException propagated from the services this method calls
   */
  public IlpService createIlpFiles() throws DataFormatException {

    // ----- positive sequences -----
    ArrayList<Feature> positiveMatches =
        regulatoryElementService.getRegulatoryElements(
            regulatoryRegionService.getPositiveRegulatoryRegions(),
            positiveCutOffScore,
            posATcomposition,
            posCGcomposition);
    if (null == positiveMatches) {
      // Fall back to the overload that takes both region lists and the cut-off score.
      positiveMatches =
          regulatoryElementService.getRegulatoryElements(
              regulatoryRegionService.getPositiveRegulatoryRegions(),
              regulatoryRegionService.getNegativeRegulatoryRegions(),
              positiveCutOffScore);
    }

    // Statistics must be noted before the next getRegulatoryElements call.
    addMotifMatchingStatisticsToNotes("positive");
    SystemVariables.getInstance().setPosSeqRelElMatchesNum(positiveMatches.size());

    // ----- negative sequences -----
    ArrayList<Feature> negativeMatches =
        regulatoryElementService.getRegulatoryElements(
            regulatoryRegionService.getNegativeRegulatoryRegions(),
            negativeCutOffScore,
            negATcomposition,
            negCGcomposition);
    if (null == negativeMatches) {
      // Same fallback overload; the second region list is passed as null here.
      negativeMatches =
          regulatoryElementService.getRegulatoryElements(
              regulatoryRegionService.getNegativeRegulatoryRegions(), null, negativeCutOffScore);
    }

    addMotifMatchingStatisticsToNotes("negative");
    SystemVariables.getInstance().setNegSeqRelElMatchesNum(negativeMatches.size());

    /*
     * Disabled diagnostic output, kept for reference:
     *
     * System.out.println("\n Number of matches of pairs of matrices in jurkat and erythroid sequences");
     *
     * String[] pwmPair = new String[]{"MA0055.1", "MA0152.1"};
     * System.out.println(pwmPair[0] + " and " + pwmPair[1]+" :\t" +
     *     FeaturesTools.numSequencesContainingAll(pwmPair, posRegElements) + "\t"+
     *     FeaturesTools.numSequencesContainingAll(pwmPair, negRegElements));
     *
     * System.out.println("\nDistances between Gata and Ebox for positive sequences:");
     * FeaturesTools.printRegElDistances("Gata", "Ebox", posRegElements);
     */

    final IlpService service = new IlpService(tempIlpJobDirName);
    service.createIlpFiles(
        regulatoryRegionService.getPositiveRegulatoryRegions(),
        regulatoryRegionService.getNegativeRegulatoryRegions(),
        positiveMatches,
        negativeMatches);
    return service;
  }
コード例 #4
0
  /**
   * Creates a service whose positive and negative regulatory regions are simulated.
   *
   * <p>The positive regions of a {@code CElegansRegRegionService} serve as the model: the length
   * of the first model sequence becomes the length of every simulated sequence. Negative regions
   * come from {@code FeaturesTools.generateSimulatedRegulatoryRegions}, positive regions from
   * {@code FeaturesTools.generateSimulatedRegulatoryRegionsWithPositionalPSSMs}.
   *
   * @param numOfPositiveRegRegions number of positive regions to simulate
   * @param negExMultiplicationFactor the number of negative regions is this factor times {@code
   *     numOfPositiveRegRegions}
   * @throws DataFormatException if the model service provides no positive regions, or if one of
   *     the called services reports a problem
   */
  public SyntheticRegRegionService(int numOfPositiveRegRegions, int negExMultiplicationFactor)
      throws DataFormatException {

    modelRegRegionService = new CElegansRegRegionService(0);

    File pwmDir = new File(SystemVariables.getInstance().getString("C.elegans.PWMs.dir"));
    String tmpJobDir =
        SystemVariables.getInstance()
            .getString("temp.output.dir"); // TODO create a proper tmp job dir for experiments

    modelRegElService = new PatserRegElementService(pwmDir, tmpJobDir);

    // A missing (null) list is treated like an empty one, matching the null check that
    // generateRegulatoryRegions performs on the same getter.
    ArrayList<Feature> modelRegions = modelRegRegionService.getPositiveRegulatoryRegions();
    if (modelRegions == null || modelRegions.isEmpty()) {
      throw new DataFormatException(
          "Model regulatory regions are empty. Can not model synthetic regions based on an empty regions.");
    }

    // Simulated sequences take the length of the first model sequence. The former
    // "just for testing" override that forced this to 25 has been removed so that the
    // computed value is actually used.
    regulatorySequenceLength = modelRegions.get(0).getSequence().length();

    negativeRegRegions =
        FeaturesTools.generateSimulatedRegulatoryRegions(
            negExMultiplicationFactor * numOfPositiveRegRegions,
            regulatorySequenceLength,
            NEGATIVE_REGION_PREFIX,
            sequenceNtProbabilities);

    positiveRegRegions =
        FeaturesTools.generateSimulatedRegulatoryRegionsWithPositionalPSSMs(
            numOfPositiveRegRegions,
            regulatorySequenceLength,
            sequenceNtProbabilities,
            SystemVariables.getInstance().getPositivePatserCutOffScore(),
            POSITIVE_REGION_PREFIX,
            modelRegions,
            modelRegElService);
  }
コード例 #5
0
ファイル: Explorer.java プロジェクト: turcotte/moduleinducer
  /**
   * Appends the per-PSSM match counts for one kind of sequence to the experiment notes.
   *
   * <p>Note: unsound method. It reads the statistics currently held by the regulatory element
   * service ({@code getPssmMatchingStatistics()}). According to the original author's note, every
   * {@code getRegulatoryElements} call overrides those statistics, so this method has to be called
   * immediately after the extraction it is meant to describe.
   *
   * <p>Nothing is appended when the statistics are {@code null} or empty.
   *
   * @param sequenceKind {@code "positive"} selects the positive regions for the region count; any
   *     other value selects the negative regions. The value is also inserted verbatim into the
   *     note heading.
   * @throws DataFormatException propagated from the services this method calls
   */
  private void addMotifMatchingStatisticsToNotes(String sequenceKind) throws DataFormatException {

    Hashtable<String, Double> pssmMatchStats = regulatoryElementService.getPssmMatchingStatistics();
    if (pssmMatchStats == null || pssmMatchStats.isEmpty()) {
      return;
    }

    SystemVariables.getInstance()
        .appendToExperimentNotes(
            "\\n* Number of PSSM matches in " + sequenceKind + " sequences: ");

    int numberOfRegions =
        "positive".equals(sequenceKind)
            ? regulatoryRegionService.getPositiveRegulatoryRegions().size()
            : regulatoryRegionService.getNegativeRegulatoryRegions().size();

    int totalNumMatches = 0;
    for (String pssmName : pssmMatchStats.keySet()) {
      double pssmStat = pssmMatchStats.get(pssmName);
      // Round instead of truncating: the statistic is scaled back to a count here (presumably
      // it is a per-region rate - confirm against RegElementService), and a product such as
      // 0.29 * 100 evaluates to 28.999999999999996, which a bare (int) cast would report as 28.
      int numMatches = (int) Math.round(pssmStat * numberOfRegions);
      totalNumMatches += numMatches;

      SystemVariables.getInstance()
          .appendToExperimentNotes("\\n\\t" + pssmName + "\\t" + numMatches);
    }
    SystemVariables.getInstance()
        .appendToExperimentNotes("\\n Total number of matches: " + totalNumMatches);
  }
コード例 #6
0
  /**
   * Generates regulatory regions for ILP positive examples.
   *
   * <p>The positive regions of the model service are scanned with a {@code
   * PatserRegElementService} using the default positive cut-off score; the resulting elements
   * feed a {@code StatAnalyser}. Each generated sequence is produced by {@code
   * generatePosExSequence} from those statistics, the PWMs of the element service, the length of
   * the first model sequence and the number of model regions. (Original author's note: the base
   * sequence is random with ACGT composition 0.25 : 0.25 : 0.25 : 0.25 and C.elegans PWMs are
   * planted at random positions - not verifiable from this method alone.)
   *
   * @param numOfRegions number of regions to generate; zero or a negative value yields an empty
   *     list
   * @param regionNamePrefix prefix of each region name; the zero-based index is appended to it
   * @return the generated regions, never {@code null}
   * @throws DataFormatException if the model service provides no positive regions, or if one of
   *     the called services reports a problem
   */
  private ArrayList<Feature> generateRegulatoryRegions(int numOfRegions, String regionNamePrefix)
      throws DataFormatException {
    ArrayList<Feature> regRegions = new ArrayList<Feature>();

    if (numOfRegions <= 0) {
      return regRegions;
    }

    // An empty model list is rejected together with a null one: the sequence length below is
    // read from the first model region, so failing here beats an IndexOutOfBoundsException
    // raised only after the costly element extraction has already run.
    ArrayList<Feature> cElegansRegRegions = modelRegRegionService.getPositiveRegulatoryRegions();
    if (cElegansRegRegions == null || cElegansRegRegions.isEmpty()) {
      throw new DataFormatException(
          "SyntheticRegRegionService: Unable to retrieve C.elegans regulatory regions to generate positive sequences.");
    }

    File pwmDir = new File(SystemVariables.getInstance().getString("C.elegans.PWMs.dir"));
    String tmpJobDir =
        SystemVariables.getInstance()
            .getString("temp.output.dir"); // TODO create a proper tmp job dir for experiments

    PatserRegElementService regElService = new PatserRegElementService(pwmDir, tmpJobDir);

    double cutOffScore = SystemVariables.getInstance().getPositivePatserCutOffScore();
    // Indices 0 and 3 are summed as the A/T share, 1 and 2 as the C/G share.
    double atComposition = sequenceNtProbabilities[0] + sequenceNtProbabilities[3];
    double cgComposition = sequenceNtProbabilities[1] + sequenceNtProbabilities[2];
    ArrayList<Feature> regElements =
        regElService.getRegulatoryElements(
            cElegansRegRegions, cutOffScore, atComposition, cgComposition);

    ArrayList<RegulatoryElementPWM> regElementPWMs = regElService.getRegulatoryElementsPWMs();

    StatAnalyser stat = new StatAnalyser(regElements);

    // Both values are the same for every generated sequence, so look them up once.
    int modelSequenceLength = cElegansRegRegions.get(0).getSequence().length();
    int modelRegionCount = cElegansRegRegions.size();

    for (int i = 0; i < numOfRegions; i++) {
      String sequence =
          generatePosExSequence(stat, regElementPWMs, modelSequenceLength, modelRegionCount);
      regRegions.add(new Feature(regionNamePrefix + i, "gene", null, 0, 0, sequence, 0.0));
    }

    return regRegions;
  }
コード例 #7
0
ファイル: Explorer.java プロジェクト: turcotte/moduleinducer
  /**
   * Creates an explorer over the given services and records the basic experiment facts.
   *
   * <p>After storing its arguments the constructor reads both Patser cut-off scores from {@code
   * SystemVariables}, clears the previously collected statistics, and then stores and notes the
   * number of positive and negative sequences, the cut-off scores, and the A:T / C:G composition
   * of each sequence set.
   *
   * @param regRegionService source of the positive and negative regulatory regions
   * @param regElService service used later to extract regulatory elements
   * @param tempIlpJobDirName directory name handed to every {@code IlpService} this explorer
   *     creates
   * @throws DataFormatException propagated from the services this constructor calls
   */
  public Explorer(
      RegulatoryRegionService regRegionService,
      RegulatoryElementService regElService,
      String tempIlpJobDirName)
      throws DataFormatException {
    this.regulatoryRegionService = regRegionService;
    this.regulatoryElementService = regElService;
    this.tempIlpJobDirName = tempIlpJobDirName;

    final SystemVariables settings = SystemVariables.getInstance();

    this.positiveCutOffScore = settings.getPositivePatserCutOffScore();
    this.negativeCutOffScore = settings.getNegativePatserCutOffScore();

    // ----- statistics: start from a clean slate -----
    settings.cleanStatistics();

    // Sequence counts.
    settings.setPosSeqNum(regulatoryRegionService.getPositiveRegulatoryRegions().size());
    settings.setNegSeqNum(regulatoryRegionService.getNegativeRegulatoryRegions().size());

    settings.appendToExperimentNotes(
        "\\n* Number of positive sequences: "
            + regulatoryRegionService.getPositiveRegulatoryRegions().size());
    settings.appendToExperimentNotes(
        "\\n* Number of negative sequences: "
            + regulatoryRegionService.getNegativeRegulatoryRegions().size());

    // Cut-off scores.
    settings.appendToExperimentNotes(
        "\\n* Patser cut-off score for positive sequences: " + positiveCutOffScore);
    settings.appendToExperimentNotes(
        "\\n* Patser cut-off score for negative sequences: " + negativeCutOffScore);

    // Composition of the positive set: entries 0 and 3 form the A:T share, 1 and 2 the C:G share.
    final double[] positiveShares =
        FeaturesTools.getNucleotideComposition(
            regulatoryRegionService.getPositiveRegulatoryRegions());
    this.posATcomposition = positiveShares[0] + positiveShares[3];
    this.posCGcomposition = positiveShares[1] + positiveShares[2];
    settings.appendToExperimentNotes(
        "\\n* A:T and C:G composition of positive sequences: "
            + posATcomposition
            + " "
            + posCGcomposition);
    settings.setPosATcomposition(posATcomposition);
    settings.setPosCGcomposition(posCGcomposition);

    // Composition of the negative set, computed the same way.
    final double[] negativeShares =
        FeaturesTools.getNucleotideComposition(
            regulatoryRegionService.getNegativeRegulatoryRegions());
    this.negATcomposition = negativeShares[0] + negativeShares[3];
    this.negCGcomposition = negativeShares[1] + negativeShares[2];
    settings.appendToExperimentNotes(
        "\\n* A:T and C:G composition of negative sequences: "
            + negATcomposition
            + " "
            + negCGcomposition);
    settings.setNegATcomposition(negATcomposition);
    settings.setNegCGcomposition(negCGcomposition);
  }