#!/usr/bin/perl -w # # ---------- phymlrun ---------- # a program to concvert multiple alignment files to something that can be read by the ML phylogeny program phyml # then to run phyml # David Lunt, August 2006 # modified from the excellent seqConverter.pl v1.0 # (c) Olaf R.P. Bininda-Emonds # # Input: # Sequence data in any of fasta, nexus, (classic or extended) phylip, or Se-Al formats # # Output: # Sequence data in phyml compatible phylip format, and output from phyml run on that file # # Usage: perl phymlrun.pl inputfile use strict; # Set user-input defaults and associated parameters # Data set variables my $inputType = ""; # Options are "fasta", "nexus", "phylip", and "Se-Al" my $seqFile = ""; my $dataSource; my $globalGenCode = 1; my (@accNum, %nameLabel, %sequence, %geneticCode, %accPresent); my (%deletedSeq, %finalSeq); my $seqCount; # User input variables my $seqOrder = "input"; # Options are "alphabetical" and "input" (default) changed! # Output variables my $maxLength = 0; my $fastaPrint = 0; my $fastaOut; my $nexusPrint = 0; my $nexusOut; my ($phylipTradPrint, $phylipExtPrint) = (1, 0); my $phylipOut; my $sealPrint = 0; my $sealOut; # Miscellaneous variables my $verbose = 0; my $debug = 0; my $owner = 0; # Variables relating to running phyml # my $phymlfile = $phylipOut; #taken from conversion part of program my $datatype ="0"; #default is DNA my $seqformat ="s"; my $no_datasets ="1"; my $no_boostrapped ="0"; my $subst_model ="GTR"; my $tstv_ratio ="e"; my $prop_inv ="e"; my $no_rates ="4"; my $gamma ="e"; my $start_tree ="BIONJ"; my $opt_topol ="y"; my $opt_brlength ="y"; unless (@ARGV) #check that a file is specified { die "\nUsage: perl convphyml.pl inputfile.fas"; } $seqFile = $ARGV[1]; #$ARGV[0] if not a droppable script in Platypus $dataSource = $seqFile; $dataSource =~ s/\.\w+$//; print "\n"; print "-------------------------------------------------------------------\n"; print " runPHYML\n\n"; print "reads data file in fasta, phylip or nexus format\n"; print "and writes PHYML-compatible phylip file\n"; print "then runs the ML program PHYML on this file using a GTR model\n"; print "output tree is saved in same folder as the input file\n\n"; print "David Lunt, August 2006\n"; print "-------------------------------------------------------------------\n\n"; die "ERROR: Must supply name of file containing sequence data.\n" if (not $seqFile); # die "ERROR: Must supply at least one output format.\n" unless ($fastaPrint or $nexusPrint or $phylipTradPrint or $phylipExtPrint or $sealPrint); if ($fastaPrint) { $fastaOut = $dataSource.".fasta"; $fastaOut =~ s/\.fasta$/_new.fasta/ if ($fastaOut eq $seqFile); } if ($nexusPrint) { $nexusOut = $dataSource.".nex"; $nexusOut =~ s/\.nex$/_new.nex/ if ($nexusOut eq $seqFile); } if ($phylipTradPrint or $phylipExtPrint) { $phylipOut = $dataSource.".phylip"; $phylipOut =~ s/\.phylip$/_new.phylip/ if ($phylipOut eq $seqFile); } if ($sealPrint) { $sealOut = $dataSource.".seal"; $sealOut =~ s/\.seal$/_new.seal/ if ($sealOut eq $seqFile); } $phylipExtPrint = 0 if ($phylipTradPrint); # Read in sequence data seqRead($seqFile); die "\nERROR: Could not read in sequences from file $seqFile\n" if (not @accNum); # Process for printing foreach my $seq (@accNum) { $finalSeq{$seq} = $sequence{$seq}; $maxLength = length($sequence{$seq}) if (length($sequence{$seq}) > $maxLength); } # Print results! my $ntax = scalar(@accNum); @accNum = sort { $nameLabel{$a} cmp $nameLabel{$b} } keys %nameLabel if ($seqOrder eq "alphabetical"); print "Printing results ...\n"; seqPrint(); # exit(0); ### Subroutines used in the program sub setLineBreak # Check line breaks of input files and set input record separator accordingly { my $inFile = shift; $/ ="\n"; open (IN, "<$inFile") or die "Cannot open $inFile to check form of line breaks.\n"; while () { if ($_ =~ /\r\n/) { print "\tDOS line breaks detected ...\n" if ($verbose); $/ ="\r\n"; last; } elsif ($_ =~ /\r/) { print "\tMac line breaks detected ...\n" if ($verbose); $/ ="\r"; last; } else { print "\tUnix line breaks detected ...\n" if ($verbose); $/ ="\n"; last; } } close IN; } sub seqRead { my $seqFile = shift; # if ($inputType) # { # print "\nReading in sequence data ...\n"; # print "\tInput file is $seqFile (type is $inputType)\n"; # } print "\nReading in sequence data from file $seqFile (type is $inputType) ...\n" if ($inputType); setLineBreak($seqFile); open (SEQ, "<$seqFile") or die "Cannot open file containing sequences, $seqFile\n"; my ($header, $tempAcc, $tempName, $tempSeq); my $fastaAcc; my (%nexusSpecies, %nexusAcc, $nexusRead); my ($phylipLineCount, $phylipTaxa, $phylipChars, %phylipSeq); my $sealCode; my ($sealDelFlag, $onwer) = (0, 0); while () { chomp; my $lineRead = $_; next unless ($lineRead); # Autodetect sequence format if (not $inputType) { $inputType = "fasta" if ($lineRead =~ /^>/); $inputType = "nexus" if ($lineRead =~ /\#nexus/i); $inputType = "phylip" if ($lineRead =~ /^\s*\d+\s+\d+/); $inputType = "Se-Al" if ($lineRead =~ /^\s*Database=\{/i); print "\nReading in sequence data ...\n"; print "\tinput file is $seqFile\n"; print "\ttype determined to be $inputType\n" if ($inputType); } if ($inputType eq "nexus") { # Only read in data lines if ($lineRead =~ /^\s*matrix/i) { $nexusRead = 1; next; } $nexusRead = 0 if ($lineRead =~ /;\s*$/); next unless ($nexusRead); next unless ($lineRead =~ /a/i or $lineRead =~ /c/i or $lineRead =~ /g/i or $lineRead =~ /t/i); # Clean up input line $lineRead =~ s/^\s+//; $lineRead =~ s/\'//g; my ($species, $seq) = split(/\s+/, $lineRead); $species =~ s/\s+/_/g; if (not defined $nexusSpecies{$species}) { $nexusSpecies{$species} = 1; $seqCount++; $nexusAcc{$species} = "tAlign_".$seqCount; push @accNum, $nexusAcc{$species}; $nameLabel{$nexusAcc{$species}} = $species; $sequence{$nexusAcc{$species}} = uc($seq); $geneticCode{$nexusAcc{$species}} = $globalGenCode; } else # Sequences are in interleaved format; append sequence { $sequence{$nexusAcc{$species}} .= uc($seq); } } if ($inputType eq "fasta") { if ($lineRead =~/^\s*>/) { my $species; $seqCount++; (my $tempSpecies = $lineRead) =~ s/^\s*>//; if ($tempSpecies =~ /^Mit\.\s+/) # Entry comes from European RNA project { $tempSpecies =~ s/^Mit\.\s+//i; # To fix entries from European RNA project my @speciesInfo = split(/\s+/, $tempSpecies); $species = join('_', $speciesInfo[0], $speciesInfo[1]); if (defined $speciesInfo[2]) { $fastaAcc = $speciesInfo[2]; } else { $fastaAcc = "tAlign_".$seqCount; } } else { my @speciesLine = split(/\s+/, $tempSpecies); if ($speciesLine[$#speciesLine] =~ /^\(?[A-Z]+\d+\)?$/ and scalar(@speciesLine) > 1) # Check whether last entry is an accession number { $fastaAcc = pop (@speciesLine); $fastaAcc =~ s/^\(//g; $fastaAcc =~ s/\)$//g; } else { $fastaAcc = "tAlign_".$seqCount; } $species = join('_', @speciesLine); $species = "Sequence_".$seqCount if ($species eq ""); } push @accNum, $fastaAcc; $geneticCode{$fastaAcc} = $globalGenCode; $nameLabel{$fastaAcc} = $species; } else { $sequence{$fastaAcc} .= uc($lineRead); } } if ($inputType eq "Se-Al") { my $header; $sealDelFlag = 1 if ($lineRead =~/MCoL/); # Se-Al sometimes places deleted species at end of file; do not read in remainder of file next if ($sealDelFlag == 1); next unless ($lineRead =~/NumSites/i or $lineRead =~/Owner/i or $lineRead =~/Name/i or $lineRead =~/Accession/i or $lineRead =~/Sequence/i or $lineRead =~/GeneticCode/i); if ($lineRead =~/Owner\s*\=\s*(\d+)/i) { $owner = $1; } if ($lineRead =~/Accession/i and $owner == 2) { $seqCount++; if ($lineRead =~ /null/ or $lineRead =~ /\"\"/) { $tempAcc = "tAlign_".$seqCount; } else { ($header, $tempAcc) = split (/=/, $lineRead); $tempAcc =~ s/\"//g; $tempAcc =~ s/;//g; } push @accNum, $tempAcc; } if ($lineRead =~/Name/i and $owner == 2) { ($header, $tempName) = split (/=/, $lineRead); $tempName =~ s/\"//g; $tempName =~ s/\s*;//g; } if ($lineRead =~/GeneticCode/i and $owner == 2) { ($header, $sealCode) = split (/=/, $lineRead); $sealCode =~ s/\"//g; $sealCode =~ s/\s*;//g; $geneticCode{$tempAcc} = $sealCode + 1; } if ($lineRead =~/Sequence/i and $owner == 2) { ($header, $tempSeq) = split (/=/, $lineRead); $tempSeq =~ s/\"//g; $tempSeq =~ s/;//g; $nameLabel{$tempAcc} = $tempName; $sequence{$tempAcc} = uc($tempSeq); } } if ($inputType eq "phylip") { if ($lineRead =~ /^\s*(\d+)\s+(\d+)/) { $phylipTaxa = $1; $phylipChars = $2; $phylipLineCount = 0; } else { $phylipLineCount++; $lineRead =~ s/\s//g; $phylipSeq{$phylipLineCount} .= $lineRead; $phylipLineCount = 0 if ($phylipLineCount == $phylipTaxa); } } } close SEQ; if ($inputType eq "phylip") # Postprocess input to derive taxon names and sequence; accounts for both sequential and extended formatting { for (my $i = 1; $i <= $phylipTaxa; $i++) { my $phylipAcc = "tAlign_" . $i; push @accNum, $phylipAcc; $geneticCode{$phylipAcc} = $globalGenCode; # Derive taxon name and sequence $sequence{$phylipAcc} = uc(substr($phylipSeq{$i}, 0 - $phylipChars)); $nameLabel{$phylipAcc} = substr($phylipSeq{$i}, 0, length($phylipSeq{$i}) - $phylipChars); $nameLabel{$phylipAcc} =~ s/\s+//g; } } } sub seqPrint { # Print fasta-formatted file (always) if ($fastaPrint) { print "\tWriting to fasta-formatted file $fastaOut ...\n"; open (FASTA, ">$fastaOut") or die "Cannot open fasta file for aligned DNA sequences, $fastaOut"; foreach my $entry (@accNum) { next if ($deletedSeq{$entry}); my $fastaSeq = $finalSeq{$entry}; my $breakPoint = 79; until ($breakPoint > length($fastaSeq)) { my $replaceString = "\n" . substr($fastaSeq, $breakPoint, 1); substr($fastaSeq, $breakPoint, 1) = $replaceString; $breakPoint += 80; } print FASTA ">$nameLabel{$entry}"; print FASTA "\t($entry)" unless ($entry =~ /^tAlign/); print FASTA "\n$fastaSeq\n"; } close FASTA; } # Print nexus-formatted file (on demand) if ($nexusPrint) { print "\tWriting to nexus file $nexusOut ...\n"; open (NEX, ">$nexusOut") or die "Cannot open nexus file for aligned DNA sequences, $nexusOut"; print NEX "#nexus\n\n"; print NEX "[File created from $seqFile using seqConverter.pl v1.0 on ".localtime()."]\n\n"; print NEX "begin data;\n"; print NEX "\tdimensions ntax = $ntax nchar = $maxLength;\n"; print NEX "\tformat datatype = DNA gap = - missing = ?;\n\n"; print NEX "\tmatrix\n\n"; foreach my $entry (@accNum) { next if ($deletedSeq{$entry}); if ($nameLabel{$entry} =~ /\W/) { print NEX "'$nameLabel{$entry}'"; } else { print NEX "$nameLabel{$entry}"; } print NEX "\t$finalSeq{$entry}\n"; } print NEX "\t;\nend;\n"; close NEX; } # Print phylip-formatted file (on demand) - this is now phyml compatible if ($phylipTradPrint or $phylipExtPrint) { my $maxTaxLength = 100; #this is now 100 not 10 since phyml is now compatible with long names $maxTaxLength = 100 if ($phylipTradPrint); #this is now 100 not 10 since phyml is now compatible with long names my %shortNameCount; print "\tWriting phylip ('phyml') format file $phylipOut ...\n\n"; open (PHYLIP, ">$phylipOut") or die "Cannot open phylip file for aligned DNA sequences, $phylipOut"; print PHYLIP "\t$ntax\t$maxLength\n"; foreach my $entry (@accNum) { next if ($deletedSeq{$entry}); my $phylipName = $nameLabel{$entry}; # Check name label and adjust to proper length if needed if (length($phylipName) < $maxTaxLength) { $shortNameCount{$phylipName}++; $phylipName .= " " x ($maxTaxLength - length($phylipName)); # Pad end of name with spaces as needed } else { my $trimmedName = substr($phylipName, 0 , $maxTaxLength); $shortNameCount{$trimmedName}++; if ($shortNameCount{$trimmedName} > 1) # Check for duplicates among shortened names and make unique by adding numbers { $phylipName = substr($phylipName, 0, $maxTaxLength - length($shortNameCount{$trimmedName})); $phylipName .= $shortNameCount{$trimmedName}; $phylipName .= " " x ($maxTaxLength - length($phylipName)); # Pad end of name with spaces as needed } else { $phylipName = $trimmedName; } } print PHYLIP "$phylipName "; print PHYLIP " " if ($phylipExtPrint); print PHYLIP "$finalSeq{$entry}\n"; } close PHYLIP; } # Print Se-Al-formatted file (on demand) if ($sealPrint) { print "\tWriting to Se_Al file $sealOut ...\n"; open (SEAL, ">$sealOut") or die "Cannot open Se-Al file for aligned DNA sequences, $sealOut\n"; print SEAL "Database={\n"; print SEAL "\tID='MLst';\n"; print SEAL "\tOwner=null;\n"; print SEAL "\tName=null;\n"; print SEAL "\tDescription=null;\n"; print SEAL "\tFlags=0;\n"; print SEAL "\tCount=2;\n"; print SEAL "\t{\n\t\t{\n"; print SEAL "\t\t\tID='PAli';\n"; print SEAL "\t\t\tOwner=1;\n"; print SEAL "\t\t\tName=\"$seqFile\";\n"; print SEAL "\t\t\tDescription=null;\n"; print SEAL "\t\t\tFlags=0;\n"; print SEAL "\t\t\tNumSites=$maxLength;\n"; print SEAL "\t\t\tType=\"Nucleotide\";\n"; print SEAL "\t\t\tFeatures=null;\n"; print SEAL "\t\t\tColourMode=1;\n"; print SEAL "\t\t\tLabelMode=0;\n"; print SEAL "\t\t\ttriplets=false;\n"; print SEAL "\t\t\tinverse=true;\n"; print SEAL "\t\t\tCount=$ntax;\n"; print SEAL "\t\t\t{\n"; my $i = 0; foreach my $sequence (@accNum) { next if ($deletedSeq{$sequence}); $i++; print SEAL "\t\t\t\t{\n"; print SEAL "\t\t\t\t\tID='PSeq';\n"; print SEAL "\t\t\t\t\tOwner=2;\n"; print SEAL "\t\t\t\t\tName=\"$nameLabel{$sequence}\";\n"; print SEAL "\t\t\t\t\tDescription=null;\n"; print SEAL "\t\t\t\t\tFlags=0;\n"; print SEAL "\t\t\t\t\tAccession="; if ($sequence =~/^tAlign_/) { print SEAL "null;\n"; } else { print SEAL "$sequence;\n"; } print SEAL "\t\t\t\t\tType=\"DNA\";\n"; print SEAL "\t\t\t\t\tLength=".length($finalSeq{$sequence}).";\n"; print SEAL "\t\t\t\t\tSequence=\"$finalSeq{$sequence}\";\n"; my $sealCode = $geneticCode{$sequence} - 1; print SEAL "\t\t\t\t\tGeneticCode=$sealCode;\n"; print SEAL "\t\t\t\t\tCodeTable=null;\n"; print SEAL "\t\t\t\t\tFrame=1;\n"; print SEAL "\t\t\t\t\tFeatures=null;\n"; print SEAL "\t\t\t\t\tParent=null;\n"; print SEAL "\t\t\t\t\tComplemented=false;\n"; print SEAL "\t\t\t\t\tReversed=false;\n"; print SEAL "\t\t\t\t}"; print SEAL "," unless ($i == $ntax); print SEAL "\n"; } print SEAL "\t\t\t};\n"; print SEAL "\t\t},\n"; print SEAL "\t\t{\n"; print SEAL "\t\t\tID='MCoL';\n"; print SEAL "\t\t\tOwner=1;\n"; print SEAL "\t\t\tName=\"Genetic Codes\";\n"; print SEAL "\t\t\tDescription=\"Custom Genetic Codes\";\n"; print SEAL "\t\t\tFlags=0;\n"; print SEAL "\t\t\tCount=0;\n"; print SEAL "\t\t}\n"; print SEAL "\t};\n"; print SEAL "};\n"; close SEAL; } } # ----- runphyml.pl ------- # program to run phyml ml program with a set of default values and inputfile given on the commandline # print "\nstarting PHYML phylogeny program...\n"; # write a command to system system("phyml $phylipOut $datatype $seqformat $no_datasets $no_boostrapped $subst_model $tstv_ratio $prop_inv $no_rates $gamma $start_tree $opt_topol $opt_brlength"); #print "\nphyml $phylipOut $datatype $seqformat $no_datasets $no_boostrapped $subst_model $tstv_ratio $prop_inv $no_rates $gamma $start_tree $opt_topol $opt_brlength\n\n"; exit;