pax_global_header00006660000000000000000000000064122163613100014505gustar00rootroot0000000000000052 comment=dc82890936269ecd5fa4c4895f214c53f5fec307 PhyBin-0.3/000077500000000000000000000000001221636131000125405ustar00rootroot00000000000000PhyBin-0.3/.gitignore000066400000000000000000000000421221636131000145240ustar00rootroot00000000000000Rickettsia Rickettsiales WolbachiaPhyBin-0.3/Bio/000077500000000000000000000000001221636131000132515ustar00rootroot00000000000000PhyBin-0.3/Bio/Phylogeny/000077500000000000000000000000001221636131000152275ustar00rootroot00000000000000PhyBin-0.3/Bio/Phylogeny/PhyBin.hs000066400000000000000000000650301221636131000167600ustar00rootroot00000000000000{-# LANGUAGE ScopedTypeVariables, RecordWildCards, TypeSynonymInstances, CPP #-} {-# LANGUAGE NamedFieldPuns #-} {-# LANGUAGE OverloadedStrings #-} {-# OPTIONS_GHC -fwarn-incomplete-patterns #-} {-# OPTIONS_GHC -fwarn-unused-imports #-} -- | This module contains the code that does the tree normalization and binning. -- It's the heart of the program. module Bio.Phylogeny.PhyBin ( driver, binthem, normalize, annotateWLabLists, unitTests, acquireTreeFiles, deAnnotate, retrieveHighlights, matchAnyHighlight ) where import qualified Data.Foldable as F import Data.Function (on) import Data.List (delete, minimumBy, sortBy, foldl1', foldl', intersperse, isPrefixOf) import Data.Maybe (fromMaybe, catMaybes) import Data.Either (partitionEithers) import Data.Time.Clock import Data.Tuple (swap) import qualified Data.ByteString.Char8 as B import qualified Data.Map as M import qualified Data.Set as S import qualified Data.Text.Lazy.IO as T import qualified Data.Vector as V import qualified Data.Vector.Unboxed as U import Control.Monad (forM, forM_, filterM, when, unless) import qualified Control.Concurrent.Async as Async import Control.Exception (evaluate) import Control.Applicative ((<$>),(<*>)) import Control.Concurrent (Chan) import System.FilePath (combine) import System.Directory (doesFileExist, doesDirectoryExist, createDirectoryIfMissing, getDirectoryContents, getCurrentDirectory) import System.IO (openFile, hClose, IOMode(..), stdout, withFile) import System.Info (os) import System.Process (system) import System.Exit (ExitCode(..)) import Test.HUnit ((~:),(~=?),Test,test) import qualified Data.Clustering.Hierarchical as C import qualified Data.GraphViz as Gv -- For vizualization: import Text.PrettyPrint.HughesPJClass hiding (char, Style) import Data.GraphViz.Printing (renderDot) import Bio.Phylogeny.PhyBin.CoreTypes import Bio.Phylogeny.PhyBin.Parser (parseNewick, parseNewicks) import Bio.Phylogeny.PhyBin.PreProcessor (collapseBranchLenThresh, collapseBranchBootStrapThresh, pruneTreeLeaves) import Bio.Phylogeny.PhyBin.Visualize (dotToPDF, dotNewickTree, viewNewickTree, dotDendrogram) import Bio.Phylogeny.PhyBin.RFDistance import Bio.Phylogeny.PhyBin.Binning import Bio.Phylogeny.PhyBin.Util import Debug.Trace ---------------------------------------------------------------------------------------------------- -- Turn on for extra invariant checking: debug :: Bool debug = True #ifdef SEQUENTIALIZE #warning "SEQUENTIALIING execution. Disabling all parallelism." type Async t = t async a = a wait x = return x #else type Async t = Async.Async t async = Async.async wait = Async.wait #endif -- | A dendrogram PLUS consensus trees at the intermediate points. data DendroPlus a = DPLeaf (FullTree a) | DPBranch !Double (FullTree ()) (DendroPlus a) (DendroPlus a) ---------------------------------------------------------------------------------------------------- -- | Driver to put all the pieces together (parse, normalize, bin) driver :: PhyBinConfig -> IO () driver cfg@PBC{ verbose, num_taxa, name_hack, output_dir, inputs=files, do_graph, branch_collapse_thresh, bootstrap_collapse_thresh, highlights, dist_thresh, clust_mode, rfmode, preprune_labels, print_rfmatrix } = -- Unused: do_draw do -------------------------------------------------------------------------------- -- First, find out where we are and open the files: -------------------------------------------------------------------------------- cd <- getCurrentDirectory --putStrLn$ "PHYBIN RUNNING IN DIRECTORY: "++ cd bl <- doesDirectoryExist output_dir unless bl $ createDirectoryIfMissing True output_dir if isPrefixOf "mingw" os then -- TODO: Need a portable version of this. 'filemanip' would do: putStrLn$ "Not cleaning away previous phybin outputs (TODO: port this to Windows)." else do putStrLn$ "Cleaning away previous phybin outputs..." system$ "rm -f "++output_dir++"/dendrogram.*" system$ "rm -f "++output_dir++"/cluster*" system$ "rm -f "++output_dir++"/distance_matrix.txt" system$ "rm -f "++output_dir++"/WARNINGS.txt" return () putStrLn$ "Parsing "++show (length files)++" Newick tree files." --putStrLn$ "\nFirst ten \n"++ concat (map (++"\n") $ map show $ take 10 files) -------------------------------------------------------------------------------- -- Next, parse the files and do error checking and annotation. -------------------------------------------------------------------------------- (goodFiles,warnings1) <- fmap partitionEithers $ forM files $ \ file -> do reg <- is_regular_file file if reg then return$ Left file else return$ Right file let num_files = length goodFiles bstrs <- mapM B.readFile goodFiles let (labelTab, fulltrees) = parseNewicks name_hack (zip files bstrs) nameToLabel = M.fromList $ map swap $ M.toList labelTab highlightTrs <- retrieveHighlights name_hack labelTab highlights let allTaxaLs = M.elems labelTab totalTaxa = length allTaxaLs putStrLn$ "\nTotal unique taxa ("++ show (M.size labelTab) ++"):\n "++ (unwords allTaxaLs) expected_num_taxa <- case preprune_labels of Nothing -> return totalTaxa Just ls -> return (length ls) case rfmode of TolerantNaive -> return () HashRF -> putStrLn$ "Note: defaulting to expecting ALL "++show expected_num_taxa++" to be present.." -------------------------------------------------------------------------------- case bootstrap_collapse_thresh of Just thr -> putStrLn$" !+ Collapsing branches of bootstrap value less than "++show thr Nothing -> return () case branch_collapse_thresh of Just thr -> putStrLn$" !+ Collapsing branches of length less than "++show thr Nothing -> return () let do_one :: FullTree DefDecor -> IO (Int, [FullTree DefDecor], Maybe (Int, String)) do_one (FullTree treename lblAcc parsed) = do let pruned0 = parsed -- Important: MUST collapse bootstrap first if we're doing both: pruned1 = case bootstrap_collapse_thresh of Nothing -> pruned0 Just thr -> collapseBranchBootStrapThresh thr pruned0 pruned2 = case branch_collapse_thresh of Nothing -> pruned1 Just thr -> collapseBranchLenThresh thr pruned1 pruned3 = case preprune_labels of Nothing -> Just pruned2 Just lst -> pruneTreeLeaves (S.fromList (map lkup lst)) pruned2 lkup trnm = case M.lookup trnm nameToLabel of Nothing -> error$ "Tree name "++show trnm++" did not occur ANYWHERE in the input set." Just lab -> lab -- TEMPTOGGLE -- when False $ do putStrLn$ "DRAWING TREE" -- viewNewickTree "Annotated" (FullTree file lblAcc' annot) -- viewNewickTree "Normalized" (FullTree file lblAcc' normal) -- putStrLn$ "WEIGHTS OF NORMALIZED' CHILDREN: "++ -- show (map get_weight$ get_children normal) when verbose$ putStr "." case pruned3 of Nothing -> do when verbose$ putStrLn$ "\n WARNING: tree empty after preprocessing, discarding: "++ treename return (0, [], Just (0, treename)) Just pruned3' -> let numL = numLeaves pruned3' looksOK = return (numL, [FullTree treename lblAcc pruned3'], Nothing) discardIt = return (0, [], Just (numL, treename)) in case rfmode of TolerantNaive -> looksOK HashRF | numL == expected_num_taxa -> looksOK | otherwise -> do when verbose$ putStrLn$ "\n WARNING: tree contained unexpected number of leaves (" ++ show numL ++"): "++ treename discardIt results <- mapM do_one fulltrees let (counts::[Int], validtreess, pairs:: [Maybe (Int, String)]) = unzip3 results let validtrees = concat validtreess warnings2 = catMaybes pairs putStrLn$ "\nNumber of input tree files: " ++ show num_files -- mapM_ (print . displayStrippedTree) validtrees -- Debugging. case preprune_labels of Nothing -> return () Just lst -> putStrLn$ "PRUNING trees to just these taxa: "++show lst when (length warnings2 > 0) $ putStrLn$ "Number of bad/unreadable input tree files: " ++ show (length warnings2) putStrLn$ "Number of VALID trees (correct # of leaves/taxa): " ++ show (length validtrees) putStrLn$ "Total tree nodes contained in valid trees: "++ show (sum counts) let all_branches = concatMap (F.foldr' (\x ls -> getBranchLen x:ls) []) validtrees unless (null all_branches) $ do putStrLn$ "Average branch len over valid trees: "++ show (avg all_branches) putStrLn$ "Max/Min branch lengths: "++ show (foldl1' max all_branches, foldl1' min all_branches) -------------------------------------------------------------------------------- -- Next, dispatch on the mode and do the actual clustering or binning. -------------------------------------------------------------------------------- (classes,binlist,asyncs) <- case clust_mode of BinThem -> do x <- doBins validtrees -- A list of bins, sorted by size: let binlist = reverse $ sortBy (compare `on` fst) $ map (\ (tr,OneCluster ls) -> (length ls, OneCluster ls)) $ M.toList x return (x,binlist,[]) ClusterThem{linkage} -> do (mat, dendro) <- doCluster (rfmode==HashRF) expected_num_taxa linkage validtrees -------------------- when print_rfmatrix $ printDistMat stdout mat withFile (combine output_dir ("distance_matrix.txt")) WriteMode $ \ hnd -> printDistMat hnd mat -------------------- writeFile (combine output_dir ("dendrogram.txt")) (show$ fmap treename dendro) putStrLn " [finished] Wrote full dendrogram to file dendrogram.txt" sanityCheck dendro -------------------- let plotIt mnameMap = if True -- do_graph then async (do t0 <- getCurrentTime let dot = dotDendrogram cfg "dendrogram" 1.0 dendro mnameMap highlightTrs -- Be wary of cycles, something appears to be buggy: putStrLn$ " [async] writing dendrogram as a graph to dendrogram.dot" -- writeFile (combine output_dir "dendrogram.dot") -- (show $ renderDot $ Gv.toDot dot) mtxt <- safePrintDendro dot case mtxt of Nothing -> putStrLn "WARNING: because we couldn't print it, we're not drawing the dendrogram either." Just txt -> do T.writeFile (combine output_dir "dendrogram.dot") txt putStrLn$ " [async] Next, plot dendrogram.pdf" _ <- dotToPDF dot (combine output_dir "dendrogram.pdf") t1 <- getCurrentTime putStrLn$ " [finished] Writing dendrogram diagram (" ++show(diffUTCTime t1 t0)++")") else async (return ()) case dist_thresh of Nothing -> do a <- plotIt Nothing return (M.empty,[],[a]) -- error "Fully hierarchical cluster output is not finished! Use --editdist." Just dstThresh -> do putStrLn$ "Combining all clusters at distance less than or equal to "++show dstThresh let clusts = sliceDendro (fromIntegral dstThresh) dendro classes = clustsToMap clusts -- Cache the lengths: wlens = [ (length ls, OneCluster ls) | OneCluster ls <- clusts ] sorted0 = reverse$ sortBy (compare `on` fst) wlens -- TODO: Parallel sorting? nameMap = clustsToNameMap (map snd sorted0) gvizAsync <- plotIt (Just nameMap) putStrLn$ "After flattening, cluster sizes are: "++show (map fst sorted0) -- Flatten out the dendogram: return (classes, sorted0, [gvizAsync]) unless (null binlist)$ reportClusts clust_mode binlist ---------------------------------------- -- TEST, TEMPTOGGLE: print out edge weights : -- forM_ (map snd3 results) $ \parsed -> do -- let weights = all_edge_weights (head$ S.toList taxa) parsed -- trace ("weights of "++ show parsed ++" "++ show weights) $ -- return () -- exitSuccess ---------------------------------------- -------------------------------------------------------------------------------- -- Finally, produce all the required outputs. -------------------------------------------------------------------------------- -- putStrLn$ "Final number of tree bins: "++ show (M.size classes) unless (null warnings1 && null warnings2) $ writeFile (combine output_dir "WARNINGS.txt") ("This file was generated to record all of the files which WERE NOT incorporated successfully into the results.\n" ++ "Each of these files had some kind of problem, likely one of the following:\n"++ " (1) a mismatched number of taxa (leaves) in the tree relative to the rest of the dataset\n"++ " (2) a file that could not be read.\n"++ " (3) a file that could not be parsed.\n\n"++ concat (map (\ file -> "Not a regular/readable file: "++ file++"\n") warnings1) ++ concat (map (\ (n,file) -> "Wrong number of taxa ("++ show n ++"): "++ file++"\n") warnings2)) async2 <- case clust_mode of BinThem -> outputBins binlist output_dir do_graph ClusterThem{} -> outputClusters expected_num_taxa binlist output_dir do_graph -- Wait on parallel tasks: putStrLn$ "Waiting for "++show (length$ async2:asyncs)++" asynchronous tasks to finish..." mapM_ wait (async2:asyncs) putStrLn$ "Phybin completed." -------------------------------------------------------------------------------- -- End driver -------------------------------------------------------------------------------- -- filePrefix = "bin" filePrefix = "cluster" -------------------------------------------------------------------------------- -- Driver helpers: -------------------------------------------------------------------------------- -- doBins :: [FullTree DefDecor] -> t1 -> t2 -> IO BinResults doBins :: [FullTree DefDecor] -> IO (BinResults StandardDecor) doBins validtrees = do putStrLn$ "Creating equivalence classes (bins)..." let classes = --binthem_normed$ zip files $ concat$ map snd3 results binthem validtrees return (classes) doCluster :: Bool -> Int -> C.Linkage -> [FullTree DefDecor] -> IO (DistanceMatrix, C.Dendrogram (FullTree DefDecor)) doCluster use_hashrf expected_num_taxa linkage validtrees = do t0 <- getCurrentTime when use_hashrf$ putStrLn " Using HashRF-style algorithm..." let nwtrees = map nwtree validtrees numtrees = length validtrees mat = if use_hashrf then hashRF expected_num_taxa nwtrees else fst (naiveDistMatrix nwtrees) ixtrees = zip [0..] validtrees dist (i,t1) (j,t2) | j == i = 0 -- | i == numtrees-1 = 0 | j < i = fromIntegral ((mat V.! i) U.! j) | otherwise = fromIntegral ((mat V.! j) U.! i) dist1 a b = trace ("Taking distance between "++show (fst a, fst b)) $ dist a b dendro = fmap snd $ C.dendrogram linkage ixtrees dist -- Force the distance matrix: V.mapM_ evaluate mat t1 <- getCurrentTime putStrLn$ "Time to compute distance matrix: "++show(diffUTCTime t1 t0) putStrLn$ "Clustering using method "++show linkage return (mat,dendro) reportClusts :: ClustMode -> [(Int, OneCluster StandardDecor)] -> IO () reportClusts mode binlist = do let binsizes = map fst binlist putStrLn$ " Outcome: "++show (length binlist)++" clusters found, "++show (length$ takeWhile (>1) binsizes) ++" non-singleton, top bin sizes: "++show(take 10 binsizes) putStrLn$" Up to first 30 bin sizes, excluding singletons:" forM_ (zip [1..30] binlist) $ \ (ind, (len, OneCluster ftrees)) -> do when (len > 1) $ -- Omit that long tail of single element classes... putStrLn$show$ hcat [text (" * cluster#"++show ind++", members "++ show len ++", "), case mode of BinThem -> vcat [text ("avg bootstraps "++show (get_bootstraps$ avg_trees$ map nwtree ftrees)++", "), text "all: " <> pPrint (filter (not . null) $ map (get_bootstraps . nwtree) ftrees)] ClusterThem{} -> hcat [] ] -- | Convert a flat list of clusters into a map from individual trees to clusters. clustsToMap :: [OneCluster StandardDecor] -> BinResults StandardDecor clustsToMap clusts = F.foldl' fn M.empty clusts where fn acc theclust@(OneCluster ftrs) = F.foldl' (fn2 theclust) acc ftrs fn2 theclust acc (FullTree{nwtree}) = M.insert (anonymize_annotated nwtree) theclust acc -- | Map each tree NAME onto the one-based index (in sorted order) of the cluster it -- comes from. clustsToNameMap :: [OneCluster StandardDecor] -> M.Map TreeName Int clustsToNameMap clusts = foldl' fn M.empty (zip [1..] clusts) where fn acc (ix,(OneCluster ftrs)) = foldl' (\acc' t -> M.insert (treename t) ix acc') acc ftrs flattenDendro :: (C.Dendrogram (FullTree DefDecor)) -> OneCluster StandardDecor flattenDendro dendro = case dendro of C.Leaf (FullTree{treename,labelTable,nwtree}) -> OneCluster [FullTree treename labelTable (annotateWLabLists nwtree)] C.Branch _ left right -> -- TODO: fix quadratic append flattenDendro left `appendClusts` flattenDendro right where appendClusts (OneCluster ls1) (OneCluster ls2) = OneCluster (ls1++ls2) -- | Turn a hierarchical clustering into a flat clustering. sliceDendro :: Double -> (C.Dendrogram (FullTree DefDecor)) -> [OneCluster StandardDecor] sliceDendro dstThresh den = loop den where loop br@(C.Branch dist left right) -- Too far apart to combine: | dist > dstThresh = loop left ++ loop right | otherwise = [flattenDendro br] loop br@(C.Leaf _) = [flattenDendro br] -------------------------------------------------------------------------------- -- outputClusters :: (Num a1, Ord a1, Show a1) => Int -> [(a1, t, OneCluster a)] -> String -> Bool -> IO () outputClusters :: Int -> [(Int, OneCluster a)] -> String -> Bool -> IO (Async ()) outputClusters num_taxa binlist output_dir do_graph = do let numbins = length binlist let base i size = combine output_dir (filePrefix ++ show i ++"_"++ show size) let consTrs = [ FullTree "consensus" (labelTable $ head ftrees) nwtr | (_, OneCluster ftrees) <- binlist , let nwtr :: NewickTree StandardDecor nwtr = annotateWLabLists $ fmap (const (Nothing,0)) $ consensusTree num_taxa $ map nwtree ftrees ] forM_ (zip3 [1::Int ..] binlist consTrs) $ \ (i, (size, OneCluster ftrees), fullConsTr) -> do writeFile (base i size ++".txt") (concat$ map ((++"\n") . treename) ftrees) writeFile (base i size ++"_consensus.tr") (show (displayStrippedTree fullConsTr) ++ "\n") writeFile (base i size ++"_alltrees.tr") (unlines [ show (displayStrippedTree ft) | ft <- ftrees ]) putStrLn$ " [finished] Wrote contents of each cluster to cluster_.txt" putStrLn$ " [finished] Wrote representative (consensus) trees to cluster__consensus.tr" if do_graph then do putStrLn$ "Next start the time consuming operation of writing out graphviz visualizations:" asyncs <- forM (zip3 [1::Int ..] binlist consTrs) $ \ (i, (size, OneCluster membs), fullConsTr) -> async$ do when (size > 1 || numbins < 100) $ do let dot = dotNewickTree ("cluster #"++ show i) 1.0 fullConsTr _ <- dotToPDF dot (base i size ++ ".pdf") return () async $ do mapM_ wait asyncs putStrLn$ " [finished] Wrote visual representations of consensus trees to "++filePrefix++"_.pdf" else do putStrLn$ "NOT creating processes to build per-cluster .pdf visualizations. (Not asked to.)" async (return ()) outputBins :: [(Int, OneCluster StandardDecor)] -> String -> Bool -> IO (Async ()) outputBins binlist output_dir do_graph = do let numbins = length binlist let base i size = combine output_dir (filePrefix ++ show i ++"_"++ show size) let avgs = map (avg_trees . map nwtree . clustMembers . snd) binlist forM_ (zip3 [1::Int ..] binlist avgs) $ \ (i, (size, OneCluster ftrees), avgTree) -> do let FullTree fstName labs _ = head ftrees fullAvgTr = FullTree fstName labs avgTree writeFile (base i size ++".txt") (concat$ map ((++"\n") . treename) ftrees) when debug$ do writeFile (base i size ++".dbg") (show (pPrint avgTree) ++ "\n") -- Printing the average tree instead of the stripped cannonical one: writeFile (base i size ++"_avg.tr") (show (displayDefaultTree$ deAnnotate fullAvgTr) ++ "\n") -- FIXME putStrLn$ " [finished] Wrote contents of each bin to "++filePrefix++"_.txt" putStrLn$ " Wrote representative trees to "++filePrefix++"__avg.tr" if do_graph then do putStrLn$ "Next start the time consuming operation of writing out graphviz visualizations:" asyncs <- forM (zip3 [1::Int ..] binlist avgs) $ \ (i, (size, OneCluster membs), avgTree) -> async $ do let FullTree fstName labs _ = head membs fullAvgTr = FullTree fstName labs avgTree when (size > 1 || numbins < 100) $ do let dot = dotNewickTree ("cluster #"++ show i) (1.0 / avg_branchlen (map nwtree membs)) --(annotateWLabLists$ fmap (const 0) tr) -- TEMP FIXME -- using just ONE representative tree: ( --trace ("WEIGHTED: "++ show (head$ trees bentry)) $ --(head$ trees bentry) fullAvgTr) _ <- dotToPDF dot (base i size ++ ".pdf") return () async $ do mapM_ wait asyncs putStrLn$ " [finished] Wrote visual representations of trees to "++filePrefix++"_.pdf" else async (return ()) -------------------------------------------------------------------------------- -- Monadic mapAccum mapAccumM :: Monad m => (acc -> x -> m (acc,y)) -> acc -> [x] -> m (acc,[y]) mapAccumM fn acc ls = F.foldrM fn' (acc,[]) ls where fn' x (acc,ls) = do (acc',y) <- fn acc x return (acc',y:ls) fst3 :: (t, t1, t2) -> t snd3 :: (t, t1, t2) -> t1 thd3 :: (t, t1, t2) -> t2 fst3 (a,_,_) = a snd3 (_,b,_) = b thd3 (_,_,c) = c -- Come up with an average tree from a list of isomorphic trees. -- This comes up with some blending of edge lengths. avg_trees :: [AnnotatedTree] -> AnnotatedTree avg_trees origls = fmap unlift $ foldIsomorphicTrees (foldl1 sumthem) $ map (fmap lift) origls where totalCount = fromIntegral$ length origls -- Here we do the actual averaging: unlift :: TempDecor -> StandardDecor unlift (bl, (bs, bootcount), wt, ls) = (StandardDecor (bl / totalCount) (case bootcount of 0 -> Nothing _ -> Just$ round (fromIntegral bs / fromIntegral bootcount)) wt ls) lift :: StandardDecor -> TempDecor lift (StandardDecor bl bs wt ls) = (bl, (fromMaybe 0 bs, countMayb bs), wt, ls) sumthem :: TempDecor -> TempDecor -> TempDecor sumthem (bl1, (bs1, cnt1), wt, ls) (bl2, (bs2, cnt2), _, _) = (bl1+bl2, (bs1 + bs2, cnt1+cnt2), wt, ls) countMayb Nothing = 0 countMayb (Just _) = 1 -- Used only by avg_trees above... type TempDecor = (Double, (Int, Int), Int, [Label]) avg ls = sum ls / fromIntegral (length ls) -- | Parse extra trees in addition to the main inputs (for --highlight). retrieveHighlights :: (String->String) -> LabelTable -> [FilePath] -> IO [[NewickTree ()]] retrieveHighlights name_hack labelTab ls = mapM parseHighlight ls where parseHighlight file = do bs <- B.readFile file let (lt2,htr) = parseNewick labelTab name_hack file bs unless (lt2 == labelTab) $ error$"Tree given as --highlight includes taxa not present in main tree set: "++ show(M.keys$ M.difference lt2 labelTab) return (map (fmap (const())) htr) -- | Create a predicate that tests trees for consistency with the set of --highlight -- (consensus) trees. -- -- Note, tree consistency is not the same as an exact match. It's -- like (<=) rather than (==). All trees are consistent with the -- "star topology". matchAnyHighlight :: [[NewickTree ()]] -> NewickTree () -> Bool -- matchAnyHighlight :: [[NewickTree ()]] -> NewickTree () -> Maybe Int -- If there is a match, return the index of the highlight that matched. matchAnyHighlight highlightTrs = let matchers = map mkMatcher highlightTrs mkMatcher ls = let fns = map compatibleWith ls -- Multiple predicate functions in \ tr -> -- Does it match any predicate? or$ map (\f -> f tr) fns in \ newtr -> any (\f -> f newtr) matchers PhyBin-0.3/Bio/Phylogeny/PhyBin/000077500000000000000000000000001221636131000164205ustar00rootroot00000000000000PhyBin-0.3/Bio/Phylogeny/PhyBin/Binning.hs000066400000000000000000000423761221636131000203540ustar00rootroot00000000000000{-# LANGUAGE ScopedTypeVariables, RecordWildCards, TypeSynonymInstances, CPP #-} {-# LANGUAGE NamedFieldPuns #-} {-# LANGUAGE OverloadedStrings #-} {-# OPTIONS_GHC -fwarn-incomplete-patterns #-} {-# OPTIONS_GHC -fwarn-unused-imports #-} -- | This module contains the code that does the tree normalization and binning. module Bio.Phylogeny.PhyBin.Binning ( -- * Binning and normalization binthem, normalize, normalizeFT, annotateWLabLists, deAnnotate, OneCluster(..), BinResults, StrippedTree, -- * Utilities and unit tests get_weight, unitTests, anonymize_annotated ) where import Data.Function (on) import Data.List (delete, minimumBy, sortBy, insertBy, intersperse, sort) import qualified Data.ByteString.Char8 as B import qualified Data.Map as M import Control.Applicative ((<$>),(<*>)) import Control.Concurrent (Chan) import Test.HUnit ((~:),(~=?),Test,test) -- For vizualization: import Text.PrettyPrint.HughesPJClass hiding (char, Style) import Bio.Phylogeny.PhyBin.CoreTypes import Bio.Phylogeny.PhyBin.Parser (parseNewicks, parseNewick) import Bio.Phylogeny.PhyBin.Visualize (dotNewickTree, viewNewickTree) -- Turn on for extra invariant checking: debug :: Bool debug = True ---------------------------------------------------------------------------------------------------- -- Normal form for unordered, unrooted trees ---------------------------------------------------------------------------------------------------- -- The basic idea is that what we *want* is the following, -- ROOT: most balanced point -- ORDER: sorted in increasing subtree weight -- But that's not quite good enough. There are ties to break. To do -- that we fall back on the (totally ordered) leaf labels. -------------------------------------------------------------------------------- -- Our sorting criteria for the children of interior nodes: compare_childtrees :: AnnotatedTree -> AnnotatedTree -> Ordering compare_childtrees node1 node2 = case (subtreeWeight $ get_dec node1) `compare` (subtreeWeight $ get_dec node2) of -- Comparisons on atoms cause problems WRT to determinism between runs if parallelism is introduced. -- Can consider it an optimization for the serial case perhaps: -- EQ -> case map deAtom (get_label_list node1) `compare` -- map deAtom (get_label_list node2) of EQ -> case (sortedLabels$ get_dec node1) `compare` (sortedLabels$ get_dec node2) of EQ -> error$ "Internal invariant broken. These two children have equal ordering priority:\n" ++ "Pretty printing:\n " ++ show (pPrint node1) ++ "\n " ++ show (pPrint node2) ++ "\nFull data structures:\n " ++ show (node1) ++ "\n " ++ show (node2) x -> x x -> x -- | A version lifted to operate over full trees. normalizeFT :: FullTree StandardDecor -> FullTree StandardDecor normalizeFT (FullTree nm labs tr) = FullTree nm labs (normalize tr) -- | This is it, here's the routine that transforms a tree into normal form. -- This relies HEAVILY on lazy evaluation. normalize :: AnnotatedTree -> AnnotatedTree normalize tree = snd$ loop tree Nothing where add_context dec Nothing = dec add_context dec (Just c) = add_weight dec c -- loop: Walk over the tree, turning it inside-out in the process. -- Inputs: -- 1. node: the NewickTree node to process ("us") -- 2. context: all nodes connected through the parent, "flipped" as though *we* were root -- The "flipped" part has ALREADY been normalized. -- Outputs: -- 1. new node -- 3. the best candidate root anywhere under this subtree loop :: AnnotatedTree -> Maybe (AnnotatedTree) -> (AnnotatedTree, AnnotatedTree) loop node ctxt = case node of NTLeaf (StandardDecor _ _ w sorted) _name -> (node, -- If the leaf becomes the root... we could introduce another node: NTInterior (add_context (StandardDecor 0 Nothing w sorted) ctxt) $ (verify_sorted "1" id$ maybeInsert compare_childtrees ctxt [node]) -- It may be reasonable to not support leaves becoming root.. that changes the number of nodes! --error "normalize: leaf becoming root not currently supported." ) NTInterior dec ls -> let -- If this node becomes the root, the parent becomes one of our children: inverted = NTInterior inverted_dec inverted_children inverted_dec = add_context dec ctxt inverted_children = verify_sorted "2" id$ maybeInsert compare_childtrees ctxt newchildren newchildren = --trace ("SORTED "++ show (map (get_label_list . fst) sorted)) $ map fst sorted sorted = sortBy (compare_childtrees `on` fst) possibs possibs = flip map ls $ \ child -> let -- Will this diverge??? Probably depends on how equality (for delete) is defined... -- Reconstruct the current node missing one child (because it became a parent): -- Update its metadata appropriately: newinverted = NTInterior (subtract_weight inverted_dec child) (verify_sorted "3" id$ delete newnode inverted_children) (newnode, _) = result result = loop child (Just newinverted) in result -- Either us or a candidate suggested by one of the children: rootcandidates = inverted : map snd sorted -- Who wins? The "most balanced". Minimize max subtree weight. -- The compare operator is NOT allowed to return EQ here. Therefore there will be a unique minima. winner = --trace ("Candidates: \n"++ show (nest 6$ vcat (map pPrint (zip (map max_subtree_weight rootcandidates) rootcandidates )))) $ minimumBy cmpr_subtree_weight rootcandidates max_subtree_weight = maximum . map get_weight . get_children fat_id = map get_label_list . get_children cmpr_subtree_weight tr1 tr2 = case max_subtree_weight tr1 `compare` max_subtree_weight tr2 of EQ -> -- As a fallback we compare the alphabetic order of the "bignames" of the children: case fat_id tr1 `compare` fat_id tr2 of EQ -> error$ "\nInternal invariant broken. These two were equally good roots:\n" ++ show (pPrint tr1) ++ "\n" ++ show (pPrint tr2) x -> x x -> x in (NTInterior dec newchildren, winner) -- Verify that our invariants are met: verify_sorted :: (Show a, Pretty a) => String -> (a -> AnnotatedTree) -> [a] -> [a] verify_sorted msg = if debug then \ project nodes -> let weights = map (get_weight . project) nodes in if sort weights == weights then nodes -- else error$ "Child list failed verification: "++ show (pPrint nodes) else error$ msg ++ ": Child list failed verification, not sorted: "++ show (weights) ++"\n "++ show (sep $ map pPrint nodes) ++ "\n\nFull output:\n " ++ (concat$ intersperse "\n " $ map show nodes) else \ _ nodes -> nodes -- TODO: Salvage any of these tests that are worthwhile and get them into the unit tests: tt :: AnnotatedTree tt = normalize $ annotateWLabLists $ head$ snd$ parseNewick M.empty id "" "(A,(C,D,E),B);" norm4 :: FullTree StandardDecor norm4 = norm "((C,D,E),B,A);" norm5 :: AnnotatedTree norm5 = normalize$ annotateWLabLists$ head$ snd$ parseNewick M.empty id "" "(D,E,C,(B,A));" ---------------------------------------------------------------------------------------------------- -- Equivalence classes on Trees: ---------------------------------------------------------------------------------------------------- -- | When binning, the members of a OneCluster are isomorphic trees. When clustering -- based on robinson-foulds distance they are merely similar trees. newtype OneCluster a = OneCluster { clustMembers :: [FullTree a] } deriving Show -- | Ignore metadata (but keep weights) for the purpose of binning type StrippedTree = NewickTree Int -- | Index the results of binning by topology-only stripped trees -- that have their decorations removed. type BinResults a = M.Map StrippedTree (OneCluster a) -- | The binning function. -- Takes labeled trees, classifies labels into equivalence classes. binthem :: [FullTree DefDecor] -> BinResults StandardDecor binthem ls = binthem_normed normalized where normalized = map (\ (FullTree n lab tree) -> FullTree n lab (normalize $ annotateWLabLists tree)) ls -- | This version accepts trees that are already normalized: binthem_normed :: [FullTree StandardDecor] -> BinResults StandardDecor binthem_normed normalized = -- foldl (\ acc (lab,tree) -> M.insertWith update tree (OneCluster{ members=[lab] }) acc) foldl (\ acc ft@(FullTree treename _ tree) -> M.insertWith update (anonymize_annotated tree) (OneCluster [ft]) acc) M.empty normalized --(map (mapSnd$ fmap (const ())) normalized) -- still need to STRIP them where -- update new old = OneCluster{ members= (members new ++ members old) } update (OneCluster new) (OneCluster old) = OneCluster (new++old) --strip = fmap (const ()) -- | For binning. Remove branch lengths and labels but leave weights. anonymize_annotated :: AnnotatedTree -> StrippedTree anonymize_annotated = fmap (\ (StandardDecor bl bs w labs) -> w) ---------------------------------------------------------------------------------------------------- -- Other tools and algorithms. ---------------------------------------------------------------------------------------------------- -- Extract all edges connected to a particular node in every tree. Return branch lengths. all_edge_weights lab trees = concat$ map (loop []) trees where loop acc (NTLeaf len name) | lab == name = len:acc loop acc (NTLeaf _ _) = acc loop acc (NTInterior _ ls) = foldl loop acc ls ---------------------------------------------------------------------------------------------------- -- Bitvector based normalization. ---------------------------------------------------------------------------------------------------- -- TODO: This approach is probably faster. Give it a try. {- int NumberOfSetBits(int i) { i = i - ((i >> 1) & 0x55555555); i = (i & 0x33333333) + ((i >> 2) & 0x33333333); return ((i + (i >> 4) & 0xF0F0F0F) * 0x1010101) >> 24; } int __builtin_popcount (unsigned int x); -} ---------------------------------------------------------------------------------------------------- {- ---------------------------------------- PARSING TIMING TEST: ---------------------------------------- Compiling this with GHC 6.12 on my laptop -O2... It takes 0.043s startup to parse ten files. And 0.316 seconds to parse 2648.. so we can say that's almost all time spent parsing/building/traversing. (All nodes summed to 14966) (The tested version uses Strings for labels... not Atoms) Comparing against the original mzscheme version (with Racket 5.0) with default optimization (there's no obvious -O2), well the generated .exe has a ~0.5 second startup time overhead... 0.881 seconds total to do the parsing, or about 380ms just for parsing. But that doesn't do the counting! Ok, this mzscheme version is in a messed up state at this point, but hacking it to do a count (and it gets a different one.. 12319), I get 0.882 seconds real time, that is neglibly more. If anything parsec should be at a disadvantage because of the lack of a preprocessing phase to generate the FSM... Btw, switching node labels over to Atoms made no difference. (But didn't slow down at least.) We wouldn't expect this to save anything on the construction side... parsec still allocates/builds the strings before we intern them. -} ---------------------------------------------------------------------------------------------------- -- General helper/utility functions: ---------------------------------------------------------------------------------------------------- merge :: Ord a => [a] -> [a] -> [a] merge [] ls = ls merge ls [] = ls merge l@(a:b) r@(x:y) = if a < x then a : merge b r else x : merge y l -- Set subtraction for sorted lists: demerge :: (Ord a, Show a) => [a] -> [a] -> [a] demerge ls [] = ls demerge [] ls = error$ "demerge: first list did not contain all of second, remaining: " ++ show ls demerge (a:b) r@(x:y) = case a `compare` x of EQ -> demerge b y LT -> a : demerge b r GT -> error$ "demerge: element was missing from first list: "++ show x -- maybeCons :: Maybe a -> [a] -> [a] -- maybeCons Nothing ls = ls -- maybeCons (Just x) ls = x : ls maybeInsert :: (a -> a -> Ordering) -> Maybe a -> [a] -> [a] maybeInsert _ Nothing ls = ls maybeInsert fn (Just x) ls = insertBy fn x ls -- | Add the metadata that is used for binning annotateWLabLists :: NewickTree DefDecor -> AnnotatedTree annotateWLabLists tr = case tr of NTLeaf (bs,bl) n -> NTLeaf (StandardDecor bl bs 1 [n]) n NTInterior (bs,bl) [] -> NTInterior (StandardDecor bl bs 0 []) [] -- NTInterior (bs,bl) [] -> error "annotateWLabLists: internal invariant broken. Shouldn't have NTInterior with null children." NTInterior (bs,bl) ls -> let children = map annotateWLabLists ls in NTInterior (StandardDecor bl bs (sum $ map (subtreeWeight . get_dec) children) (foldl1 merge $ map (sortedLabels . get_dec) children)) children ---------------------------------------------------------------------------------------------------- -- Simple Helper Functions ---------------------------------------------------------------------------------------------------- -- | Take the extra annotations away. Inverse of `annotateWLabLists`. deAnnotate :: FullTree StandardDecor -> FullTree DefDecor deAnnotate (FullTree a b tr) = FullTree a b (fmap (\ (StandardDecor bl bs _ _) -> (bs,bl)) tr) -- Number of LEAVES contained in subtree: get_weight :: AnnotatedTree -> Int get_weight = subtreeWeight . get_dec -- Sorted list of leaf labels contained in subtree get_label_list :: AnnotatedTree -> [Label] get_label_list = sortedLabels . get_dec add_weight :: StandardDecor -> AnnotatedTree -> StandardDecor add_weight (StandardDecor l1 bs1 w1 sorted1) node = let (StandardDecor _ bs2 w2 sorted2) = get_dec node in StandardDecor l1 ((+) <$> bs1 <*> bs2) (w1+w2) (merge sorted1 sorted2) -- Remove the influence of one subtree from the metadata of another. subtract_weight :: StandardDecor -> AnnotatedTree -> StandardDecor subtract_weight (StandardDecor l1 bs1 w1 sorted1) node = let (StandardDecor _ bs2 w2 sorted2) = get_dec node in StandardDecor l1 ((-) <$> bs1 <*> bs2) (w1-w2) (demerge sorted1 sorted2) ---------------------------------------------------------------------------------------------------- -- UNIT TESTING ---------------------------------------------------------------------------------------------------- tre1 :: (LabelTable, NewickTree DefDecor) tre1 = (x,head y) where (x,y) = parseNewick M.empty id "" "(A:0.1,B:0.2,(C:0.3,D:0.4):0.5);" tre1draw :: IO (Chan (), FullTree StandardDecor) tre1draw = viewNewickTree "tre1"$ (FullTree "" (fst tre1) (annotateWLabLists (snd tre1))) tre1dot :: IO () tre1dot = print $ dotNewickTree "" 1.0 $ (FullTree "" (fst tre1) (annotateWLabLists$ snd tre1)) norm :: String -> FullTree StandardDecor norm = norm2 . B.pack norm2 :: B.ByteString -> FullTree StandardDecor norm2 bstr = FullTree "" tbl (normalize $ annotateWLabLists$ head tr) where (tbl,tr) = parseNewick M.empty id "test" bstr unitTests :: Test unitTests = let ntl s = NTLeaf (Nothing,0.0) s (tbl,tre1_) = tre1 in test [ "merge" ~: [1,2,3,4,5,6] ~=? merge [1,3,5] [2,4,6::Int] , "demerge" ~: [2,4,6] ~=? demerge [1,2,3,4,5,6] [1,3,5::Int] , "demerge" ~: [1,3,5] ~=? demerge [1,2,3,4,5,6] [2,4,6::Int] , "annotateWLabLists" ~: ["A","B","C","D"] -- ORD on atoms is expensive... it must use the whole string. ~=? map (tbl M.!) (sortedLabels (get_dec (annotateWLabLists tre1_))) -- Make sure that all of these normalize to the same thing. , "normalize1" ~: "(C, D, E, (A, B))" ~=? show (displayDefaultTree $ deAnnotate$ norm "(A,(C,D,E),B);") , "normalize2A" ~: "(C, D, E, (A, B))" ~=? show (pPrint$ norm "((C,D,E),B,A);") , "normalize2B" ~: "(C, D, E, (A, B))" ~=? show (pPrint$ norm "(D,E,C,(B,A));") -- Here's an example from the rhizobia datasetsthat that caused my branch-sorting to fail. , "normalize3" ~: "(((BB, BJ)), (MB, ML), (RE, (SD, SM)))" ~=? show (pPrint$ norm2 (B.pack "(((ML,MB),(RE,(SD,SM))),(BB,BJ));")) -- "((BB: 2.691831, BJ: 1.179707): 0.000000, ((ML: 0.952401, MB: 1.020319): 0.000000, (RE: 2.031345, (SD: 0.180786, SM: 0.059988): 0.861187): 0.717913): 0.000000);" , "phbin: these 3 trees should fall in the same category" ~: 1 ~=? (length $ M.toList $ binthem $ snd $ parseNewicks id [("one", "(A,(C,D,E),B);"), ("two", "((C,D,E),B,A);"), ("three","(D,E,C,(B,A));")] -- [("one", snd$parseNewick M.empty id "" "(A,(C,D,E),B);"), -- ("two", snd$parseNewick M.empty id "" "((C,D,E),B,A);"), -- ("three", snd$parseNewick M.empty id "" "(D,E,C,(B,A));")] ) , "dotConversion" ~: True ~=? 100 < length (show $ dotNewickTree "" 1.0$ norm "(D,E,C,(B,A));") -- 444 ] -------------------------------------------------------------------------------- PhyBin-0.3/Bio/Phylogeny/PhyBin/CoreTypes.hs000066400000000000000000000323241221636131000206750ustar00rootroot00000000000000{-# LANGUAGE NamedFieldPuns, BangPatterns #-} {-# LANGUAGE OverloadedStrings, TypeSynonymInstances, FlexibleInstances #-} {-# LANGUAGE CPP #-} {-# OPTIONS_GHC -fwarn-incomplete-patterns #-} {-# OPTIONS_GHC -fwarn-unused-imports #-} module Bio.Phylogeny.PhyBin.CoreTypes ( -- * Tree and tree decoration types NewickTree(..), DefDecor, StandardDecor(..), AnnotatedTree, FullTree(..), ClustMode(..), TreeName, NumTaxa(..), -- * Tree operations displayDefaultTree, displayStrippedTree, treeSize, numLeaves, liftFT, get_dec, set_dec, get_children, map_labels, all_labels, foldIsomorphicTrees, -- * Utilities specific to StandardDecor: avg_branchlen, get_bootstraps, -- * Command line config options PhyBinConfig(..), default_phybin_config, WhichRFMode(..), -- * General helpers Label, LabelTable, -- * Experimenting with abstracting decoration operations HasBranchLen(..) ) where import qualified Data.Map as M import qualified Data.Set as S import Data.Foldable (Foldable(..)) import Data.Maybe (maybeToList) import Data.Monoid (mappend, mconcat) import Text.PrettyPrint.HughesPJClass hiding (char, Style) import qualified Data.Clustering.Hierarchical as C #define NO_ATOMS #ifndef NO_ATOMS import StringTable.Atom #endif ---------------------------------------------------------------------------------------------------- -- Type definitions ---------------------------------------------------------------------------------------------------- type BranchLen = Double -- | Even though the Newick format allows it, here we ignore interior node -- labels. (They are not commonly used.) -- -- Note that these trees are rooted. The `normalize` function ensures that a -- single, canonical rooted representation is chosen. data NewickTree a = NTLeaf a {-# UNPACK #-} !Label | NTInterior a [NewickTree a] deriving (Show, Eq, Ord) -- TODO: Ordering maybe shouldn't need to touch the metadata. At least on the fast -- path. {- -- [2010.09.22] Disabling: instance NFData Atom where rnf a = rnf (fromAtom a :: Int) instance NFData a => NFData (NewickTree a) where rnf (NTLeaf l n) = rnf (l,n) rnf (NTInterior l ls) = rnf (l,ls) -} instance Functor NewickTree where fmap fn (NTLeaf dec x) = NTLeaf (fn dec) x fmap fn (NTInterior dec ls) = NTInterior (fn dec) (map (fmap fn) ls) instance Foldable NewickTree where foldMap f (NTLeaf dec x) = f dec foldMap f (NTInterior dec ls) = mappend (f dec) $ mconcat (map (foldMap f) ls) instance Foldable FullTree where foldMap f (FullTree _ _ tr) = foldMap f tr instance Functor FullTree where fmap f (FullTree n l tr) = FullTree n l $ fmap f tr instance Pretty dec => Pretty (NewickTree dec) where -- I'm using displayDefaultTree for the "prettiest" printing and -- replacing pPrint with a whitespace-improved version of show: pPrint (NTLeaf dec name) = "NTLeaf" <+> pPrint dec <+> text (show name) pPrint (NTInterior dec ls) = "NTInterior" <+> pPrint dec <+> pPrint ls instance Pretty a => Pretty (FullTree a) where pPrint (FullTree name mp tr) = "FullTree " <+> text name <+> loop tr where loop (NTLeaf dec ind) = "NTLeaf" <+> pPrint dec <+> text (mp M.! ind) loop (NTInterior dec ls) = "NTInterior" <+> pPrint dec <+> pPrint ls instance (Pretty k, Pretty v) => Pretty (M.Map k v) where pPrint mp = pPrint (M.toList mp) -- | Display a tree WITH the bootstrap and branch lengths. -- This prints in NEWICK format. displayDefaultTree :: FullTree DefDecor -> Doc displayDefaultTree orig = loop tr <> ";" where (FullTree _ mp tr) = orig -- normalize orig loop (NTLeaf (Nothing,_) name) = text (mp M.! name) loop (NTLeaf _ _) = error "WEIRD -- why did a leaf node have a bootstrap value?" loop (NTInterior (bootstrap,_) ls) = case bootstrap of Nothing -> base Just val -> base <> text ":[" <> text (show val) <> text "]" where base = parens$ sep$ map_but_last (<>text",") $ map loop ls -- | The same, except with no bootstrap or branch lengths. Any tree annotations -- ignored. displayStrippedTree :: FullTree a -> Doc displayStrippedTree orig = loop tr <> ";" where (FullTree _ mp tr) = orig -- normalize orig loop (NTLeaf _ name) = text (mp M.! name) loop (NTInterior _ ls) = parens$ sep$ map_but_last (<>text",") $ map loop ls ---------------------------------------------------------------------------------------------------- -- Labels ---------------------------------------------------------------------------------------------------- -- | Labels are inexpensive unique integers. The table is necessary for converting them back. type Label = Int -- | Map labels back onto meaningful names. type LabelTable = M.Map Label String ---------------------------------------------------------------------------------------------------- -- Tree metadata (decorators) ---------------------------------------------------------------------------------------------------- -- | The barebones default decorator for NewickTrees contains BOOTSTRAP and -- BRANCHLENGTH. The bootstrap values, if present, will range in [0..100] type DefDecor = (Maybe Int, BranchLen) -- | Additionally includes some scratch data that is used by the binning algorithm. type AnnotatedTree = NewickTree StandardDecor -- | The standard decoration includes everything in `DefDecor` plus -- some extra cached data: -- -- (1) branch length from parent to "this" node -- (2) bootstrap values for the node -- -- (3) subtree weights for future use -- (defined as number of LEAVES, not counting intermediate nodes) -- (4) sorted lists of labels for symmetry breaking data StandardDecor = StandardDecor { branchLen :: BranchLen, bootStrap :: Maybe Int, -- The rest of these are used by the computations below. These are -- cached (memoized) values that could be recomputed: ---------------------------------------- subtreeWeight :: Int, sortedLabels :: [Label] } deriving (Show,Read,Eq,Ord) class HasBranchLen a where getBranchLen :: a -> BranchLen instance HasBranchLen StandardDecor where getBranchLen = branchLen -- This one is kind of sloppy: instance HasBranchLen DefDecor where getBranchLen = snd -- | A common type of tree contains the standard decorator and also a table for -- restoring the human-readable node names. data FullTree a = FullTree { treename :: TreeName , labelTable :: LabelTable , nwtree :: NewickTree a } deriving (Show, Ord, Eq) liftFT :: (NewickTree t -> NewickTree a) -> FullTree t -> FullTree a liftFT fn (FullTree nm labs x) = FullTree nm labs (fn x) type TreeName = String instance Pretty StandardDecor where pPrint (StandardDecor bl bs wt ls) = parens$ "StandardDecor" <+> hsep [pPrint bl, pPrint bs -- , pPrint wt, pPrint ls ] ---------------------------------------------------------------------------------------------------- -- * Configuring and running the command line tool. ---------------------------------------------------------------------------------------------------- -- | Due to the number of configuration options for the driver, we pack them into a record. data PhyBinConfig = PBC { verbose :: Bool , num_taxa :: NumTaxa , name_hack :: String -> String , output_dir :: String , inputs :: [String] , do_graph :: Bool , do_draw :: Bool , clust_mode :: ClustMode , highlights :: [FilePath] -- [FullTree ()] , show_trees_in_dendro :: Bool , show_interior_consensus :: Bool , rfmode :: WhichRFMode , preprune_labels :: Maybe [String] , print_rfmatrix :: Bool , dist_thresh :: Maybe Int , branch_collapse_thresh :: Maybe Double -- ^ Branches less than this length are collapsed. , bootstrap_collapse_thresh :: Maybe Int -- ^ BootStrap values less than this result in the intermediate node being collapsed. } -- | Supported modes for computing RFDistance. data WhichRFMode = HashRF | TolerantNaive deriving (Show, Eq, Ord) -- | How many taxa should we expect in the incoming dataset? data NumTaxa = Expected Int -- ^ Supplied by the user. Committed. | Unknown -- ^ In the future we may automatically pick a behavior. Now this one is usually an error. | Variable -- ^ Explicitly ignore this setting in favor of comparing all trees -- (even if some are missing taxa). This only works with certain modes. deriving (Show, Read, Eq) -- | The default phybin configuration. default_phybin_config :: PhyBinConfig default_phybin_config = PBC { verbose = False -- , num_taxa = error "must be able to determine the number of taxa expected in the dataset. (Supply it manually with -n.)" , num_taxa = Unknown , name_hack = id -- Default, no transformation of leaf-labels , output_dir = "./phybin_out/" , inputs = [] , do_graph = False , do_draw = False -- , clust_mode = BinThem , clust_mode = ClusterThem C.UPGMA , rfmode = HashRF , preprune_labels = Nothing , highlights = [] , show_trees_in_dendro = False , show_interior_consensus = False , print_rfmatrix = False , dist_thresh = Nothing , branch_collapse_thresh = Nothing , bootstrap_collapse_thresh = Nothing } data ClustMode = BinThem | ClusterThem { linkage :: C.Linkage } ---------------------------------------------------------------------------------------------------- -- * Simple utility functions for the core types: ---------------------------------------------------------------------------------------------------- -- | How many nodes (leaves and interior) are contained in a NewickTree? treeSize :: NewickTree a -> Int treeSize (NTLeaf _ _) = 1 treeSize (NTInterior _ ls) = 1 + sum (map treeSize ls) -- | This counts only leaf nodes, which should include all taxa. numLeaves :: NewickTree a -> Int numLeaves (NTLeaf _ _) = 1 numLeaves (NTInterior _ ls) = sum (map numLeaves ls) map_but_last :: (a -> a) -> [a] -> [a] map_but_last _ [] = [] map_but_last _ [h] = [h] map_but_last fn (h:t) = fn h : map_but_last fn t get_dec :: NewickTree t -> t get_dec (NTLeaf dec _) = dec get_dec (NTInterior dec _) = dec -- Set all the decorations to a constant: set_dec :: b -> NewickTree a -> NewickTree b set_dec d = fmap (const d) --set_dec d (NTLeaf _ x) = NTLeaf d x --set_dec d (NTInterior _ ls) = NTInterior d $ map (set_dec d) ls get_children :: NewickTree t -> [NewickTree t] get_children (NTLeaf _ _) = [] get_children (NTInterior _ ls) = ls -- | Average branch length across all branches in all all trees. avg_branchlen :: HasBranchLen a => [NewickTree a] -> Double avg_branchlen origls = fst total / snd total where total = sum_ls $ map sum_tree origls sum_ls ls = (sum$ map fst ls, sum$ map snd ls) sum_tree (NTLeaf dec _) | getBranchLen dec == 0 = (0,0) | otherwise = (abs (getBranchLen dec),1) sum_tree (NTInterior dec ls) = let branchLen = getBranchLen dec (x,y) = sum_ls$ map sum_tree ls in if branchLen == 0 then (x, y) else ((abs branchLen) + x, 1+y) -- | Retrieve all the bootstraps values actually present in a tree. get_bootstraps :: NewickTree StandardDecor -> [Int] get_bootstraps (NTLeaf (StandardDecor{bootStrap}) _) = maybeToList bootStrap get_bootstraps (NTInterior (StandardDecor{bootStrap}) ls) = maybeToList bootStrap ++ concatMap get_bootstraps ls -- | Apply a function to all the *labels* (leaf names) in a tree. map_labels :: (Label -> Label) -> NewickTree a -> NewickTree a map_labels fn (NTLeaf dec lbl) = NTLeaf dec $ fn lbl map_labels fn (NTInterior dec ls) = NTInterior dec$ map (map_labels fn) ls -- | Return all the labels contained in the tree. all_labels :: NewickTree t -> [Label] all_labels (NTLeaf _ lbl) = [lbl] all_labels (NTInterior _ ls) = concat$ map all_labels ls -- | This function allows one to collapse multiple trees while looking -- only at the "horizontal slice" of all the annotations *at a given -- position* in the tree. -- -- "Isomorphic" must apply both to the shape and the name labels or it -- is an error to apply this function. foldIsomorphicTrees :: ([a] -> b) -> [NewickTree a] -> NewickTree b foldIsomorphicTrees _ [] = error "foldIsomorphicTrees: empty list of input trees" foldIsomorphicTrees fn ls@(hd:_) = fmap fn horiztrees where -- Preserve the input order: horiztrees = Prelude.foldr consTrees (fmap (const []) hd) ls -- We use the tree datatype itself as the intermediate data -- structure. This is VERY allocation-expensive, it would be -- possible to trade compute for allocation here: consTrees a b = case (a,b) of (NTLeaf dec nm1, NTLeaf decls nm2) | nm1 /= nm2 -> error$"foldIsomorphicTrees: mismatched names: "++show (nm1,nm2) | otherwise -> NTLeaf (dec : decls) nm1 (NTInterior dec ls1, NTInterior decls ls2) -> NTInterior (dec:decls) $ zipWith consTrees ls1 ls2 _ -> error "foldIsomorphicTrees: difference in tree shapes" PhyBin-0.3/Bio/Phylogeny/PhyBin/Parser.hs000066400000000000000000000236161221636131000202200ustar00rootroot00000000000000{-# LANGUAGE ScopedTypeVariables, BangPatterns, ParallelListComp #-} {-# OPTIONS_GHC -fwarn-incomplete-patterns #-} {-# OPTIONS_GHC -fwarn-unused-imports #-} module Bio.Phylogeny.PhyBin.Parser (newick_parser, parseNewick, parseNewicks, parseNewickFiles, unitTests) where import Control.Exception (evaluate, handle, SomeException) import qualified Data.ByteString.Char8 as B import Data.Char (isSpace) import Data.Map as M import Data.List as L import Text.Parsec import Text.Parsec.ByteString import Test.HUnit ((~:),(~=?),Test,test,assertFailure) import System.FilePath (takeBaseName) import Bio.Phylogeny.PhyBin.CoreTypes import Prelude as P type NameHack = (String->String) -- | Parse a bytestring into a NewickTree with branch lengths. The -- first argument is file from which the data came and is just for -- better error messages. -- -- If the single bytestring contains more than one tree, then a number is appended to -- the tree names. parseNewick :: LabelTable -> NameHack -> String -> B.ByteString -> (LabelTable, [NewickTree DefDecor]) parseNewick tbl0 name_hack file input = (lbls,trs) where (lbls,trs) = extractLabelTable tbl0 $ runB file (many1$ newick_parser name_hack) $ B.filter (not . isSpace) input -- treeFiles <- acquireTreeFiles files -- let fn f = do raw <- B.readFile f -- let ls = map (`B.append` (B.pack ";")) $ -- B.splitWith (== ';') raw -- return (map (f,) ls) -- trees0 <- concat <$> mapM fn treeFiles -- FIXME: no name_hack here: -- let (lbls, trees) = parseNewicks id trees0 -- putStrLn$ "Read trees! "++show (length trees) -- putStrLn$ "Taxa: "++show (pPrint lbls) -- putStrLn$ "First tree: "++show (displayDefaultTree (head trees)) -- | Parse a list of trees, starting with an empty map of labels and accumulating a final map. parseNewickFiles :: NameHack -> [String] -> IO (LabelTable, [FullTree DefDecor]) parseNewickFiles fn nms = do -- WARNING: Lazy-IO here. We need to be careful about leaking file descriptors. bss <- mapM B.readFile nms return $ parseNewicks fn (zip nms bss) -- | A version which takes in-memory trees as ByteStrings. parseNewicks :: NameHack -> [(String,B.ByteString)] -> (LabelTable, [FullTree DefDecor]) parseNewicks name_hack pairs = (labtbl, fullTrs) where fullTrs = [ FullTree (tweakName file ind) labtbl tr | (file,tr) <- trs | ind <- [(0::Int)..] ] tweakName file ind = -- Here we do the renaming/numbering business: if addSuffix then (takeBaseName file)++"_"++show ind else (takeBaseName file) addSuffix = case trs of [] -> False -- Should this be an error? [_] -> False _ -> True (labtbl, trs) = P.foldr fn (M.empty,[]) pairs fn (file,bstr) (!acc,!ls) = let (acc',trs) = parseNewick acc name_hack file bstr names = if length trs > 1 then zipWith (\x y -> x++"_"++show y) (repeat file) [0..] else [file] in (acc', (zip names trs) ++ ls) runB :: Show a => String -> Parser a -> B.ByteString -> a runB file p input = case (parse p "" input) of Left err -> error ("parse error in file "++ show file ++" at "++ show err) Right x -> x extractLabelTable :: LabelTable -> [TempTree] -> (LabelTable, [NewickTree DefDecor]) extractLabelTable tbl0 trs = (finMap,finTree) where flipped = M.fromList $ L.map (\(x,y)->(y,x)) $ M.toList tbl0 -- (_,finMap,finTree) = loop2 (S.fromList (M.elems tbl0)) tbl0 tr (_,finMap,finTree) = loop1 flipped tbl0 trs -- Mere plumbing: loop1 seen acc [] = (seen, acc, []) loop1 seen acc (hd:tl) = let (seen2,acc2,hd') = loop2 seen acc hd (seen3,acc3,tl') = loop1 seen2 acc2 tl in (seen3,acc3, hd':tl') loop2 seen acc (NTLeaf (d,Just nm) _) | M.member nm seen = (seen, acc, NTLeaf d (seen M.! nm)) | otherwise = let nxt = M.size acc in (M.insert nm nxt seen, M.insert nxt nm acc, NTLeaf d nxt) loop2 seen1 acc1 (NTInterior (d,Nothing) chlds) = let (seen',acc',ls') = P.foldr (\ x (seen2,acc2,ls) -> let (seen3,acc3,x') = loop2 seen2 acc2 x in (seen3, acc3, x':ls)) (seen1,acc1,[]) chlds in (seen',acc', NTInterior d ls') ---------------------------------------------------------------------------------------------------- -- Newick file format parser definitions: ---------------------------------------------------------------------------------------------------- -- | Hack: we store the names in the leaves. type TempTree = NewickTree (DefDecor,Maybe String) -- | This is used to post-facto splice metadata into the data structure. tag :: DefDecor -> TempTree -> TempTree tag l s = case s of NTLeaf (_,m) n -> NTLeaf (l,m) n NTInterior (_,Nothing) ls -> NTInterior (l,Nothing) ls -- | This parser ASSUMES that whitespace has been prefiltered from the input. newick_parser :: NameHack -> Parser TempTree newick_parser name_hack = do x <- subtree name_hack -- Get the top-level metadata: l <- branchMetadat name_hack _ <- char ';' return$ tag l x subtree :: NameHack -> Parser TempTree subtree name_hack = internal name_hack <|> leaf name_hack defaultMeta :: (Maybe Int, Double) defaultMeta = (Nothing,0.0) leaf :: NameHack -> Parser TempTree leaf name_hack = do nm <- name let nm' = name_hack nm return$ NTLeaf (defaultMeta,Just nm') 0 internal :: NameHack -> Parser TempTree internal name_hack = do _ <- char '(' bs <- branchset name_hack _ <- char ')' _nm <- name -- IGNORED, internal names. return$ NTInterior (defaultMeta,Nothing) bs branchset :: NameHack -> Parser [TempTree] branchset name_hack = do b <- branch name_hack "at least one branch" rest <- option [] $ try$ do char ','; branchset name_hack return (b:rest) branch :: NameHack -> Parser TempTree branch name_hack = do s <- subtree name_hack l <- branchMetadat name_hack return$ tag l s -- If the length is omitted, it is implicitly zero. branchMetadat :: NameHack -> Parser DefDecor branchMetadat name_hack = option defaultMeta $ do char ':' n <- (try sciNotation <|> number) -- IF the branch length succeeds then we go for the bracketed bootstrap value also: bootstrap <- option Nothing $ do char '[' s <- many1 digit char ']' return (Just (read s)) return (bootstrap,n) -- | Parse a normal, decimal number. number :: Parser Double number = do sign <- option "" $ string "-" first <- many1 digit second <- option "0" $ try$ do char '.'; many1 digit return (read (sign ++ first++"."++second) :: Double) -- | Parse a number in scientific notation. sciNotation :: Parser Double sciNotation = do coeff <- do first <- many1 digit second <- option "0" $ try$ do char '.'; many1 digit return $ first++"."++second char 'e' sign <- option "" $ string "-" expon <- many1 digit return (read (coeff++"e"++sign++expon)) -- | Names are a mess... they contain all kinds of garbage sometimes it seems. -- Thus we are very permissive. We allow anything that is not something we specifically need to reserve. name :: Parser String name = option "" $ many1 (noneOf "()[]:;, \t\n\r\f\v") -- name = option "" $ many1 (letter <|> digit <|> oneOf "_.-") -------------------------------------------------------------------------------- -- Unit Tests -------------------------------------------------------------------------------- tre1 :: NewickTree DefDecor tre1 = head $ snd $ parseNewick M.empty id "" $ B.pack "(A:0.1,B:0.2,(C:0.3,D:0.4):0.5);" unitTests :: Test unitTests = test [ "test name" ~: "foo" ~=? run name "foo" , "test number" ~: 3.3 ~=? run number "3.3" , "test number" ~: 3.0 ~=? run number "3" , "test number" ~: -3.0 ~=? run number "-3" , "leaf" ~: ntl "A" ~=? run (leaf id) "A" , "subtree" ~: ntl "A" ~=? run (subtree id) "A" -- These are not allowed: , "null branchset" ~: errortest$ run (branchset id) "" , "internal" ~: NTInterior nada [ntl "A"] ~=? run (internal id) "(A);" , "example: no nodes are named" ~: NTInterior nada [ntl "", ntl "", NTInterior nada [ntl "", ntl ""]] ~=? run (newick_parser id) "(,,(,));" , "example: leaf nodes are named" ~: 6 ~=? treeSize (run (newick_parser id) "(A,B,(C,D));") , "example: all nodes are named" ~: 6 ~=? treeSize (run (newick_parser id) "(A,B,(C,D)E)F;") , "example: all but root node have a distance to parent" ~: 6 ~=? treeSize (run (newick_parser id) "(:0.1,:0.2,(:0.3,:0.4):0.5);") , "example: all have a distance to parent" ~: 6 ~=? treeSize (run (newick_parser id) "(:0.1,:0.2,(:0.3,:0.4):0.5):0.6;") , "example: distances and leaf names (popular)" ~: 6 ~=? treeSize tre1 , "example: distances and all names" ~: 6 ~=? treeSize (run (newick_parser id) "(A:0.1,B:0.2,(C:0.3,D:0.4)E:0.5)F;") , "example: a tree rooted on a leaf node (rare)" ~: 6 ~=? treeSize (run (newick_parser id) "((B:0.2,(C:0.3,D:0.4)E:0.5)F:0.1)A;") ] where ntl s = NTLeaf ((Nothing,0.0),Just s) 0 nada = ((Nothing,0),Nothing) run :: Show a => Parser a -> String -> a run p input = runB "" p (B.pack input) errortest :: t -> IO () errortest x = --() ~=? handle (\ (e::SomeException) -> return ()) $ do evaluate x assertFailure "test was expected to throw an error" PhyBin-0.3/Bio/Phylogeny/PhyBin/PreProcessor.hs000066400000000000000000000077131221636131000214120ustar00rootroot00000000000000{-# LANGUAGE ScopedTypeVariables #-} -- | This is a preprocessor which can (optionally) be invoked to -- collapse short branch lengths. module Bio.Phylogeny.PhyBin.PreProcessor ( collapseBranches, collapseBranchLenThresh, collapseBranchBootStrapThresh, pruneTreeLeaves ) where import qualified Data.Set as S import Data.Maybe (catMaybes) import Bio.Phylogeny.PhyBin.CoreTypes -- | Prune the leaves of the tree to only those leaves in the provided set. -- -- If ALL leaves are pruned from the set, this function returns nothing. pruneTreeLeaves :: S.Set Label -> NewickTree DefDecor -> Maybe (NewickTree DefDecor) pruneTreeLeaves set tr = loop tr where loop orig@(NTLeaf _ lab) | S.member lab set = Just orig | otherwise = Nothing loop (NTInterior dec@(_,blen) ls) = case catMaybes $ map loop ls of [] -> Nothing -- Here we ELIMINATE the intermediate node, but we add in its branch length: [one] -> Just$ fmap (\(bs,bl) -> (bs,blen + bl)) one ls' -> Just (NTInterior dec ls') -- | Removes branches that do not meet a predicate, leaving a shallower, "bushier" -- tree. This does NOT change the set of leaves (taxa), it only removes interior -- nodes. -- -- `collapseBranches pred collapser tree` uses `pred` to test the meta-data to see -- if collapsing the intermediate node below the branch is necessary, and if it is, -- it uses `collapser` to reduce all the metadata for the collapsed branches into a -- single piece of metadata. collapseBranches :: forall a . (a -> Bool) -> (a -> a -> a) -> NewickTree a -> NewickTree a collapseBranches isCollapsable collapse origtr = final where (_,_, final) = loop origtr -- This loop returns: -- (1) a list of leaf "floaters" that can still move upwards, -- (2) immovable subtrees that can't -- (3) a final node IF the result is the root. loop :: NewickTree a -> ([(a,Label)], [NewickTree a], NewickTree a) loop lf@(NTLeaf dec lb) | isCollapsable dec = ([(dec,lb)], [], lf) | otherwise = ([], [lf], lf) loop (NTInterior dec children) = let (floats, anchors, _) = unzip3 $ map loop children thenode = NTInterior dec $ concat anchors ++ map (uncurry NTLeaf) (concat floats) in if isCollapsable dec then -- If we are collapsable we keep floating upwards. -- We combine our metadata (on the left) into the floatees: (map (\ (a,l) -> (collapse dec a, l)) (concat floats), concat anchors, thenode) else -- Otherwise this is the end of the road for these floaters: ([], [thenode], thenode) -- | A common configuration. Collapse branches based on a length threshold. collapseBranchLenThresh :: Double -> NewickTree DefDecor -> NewickTree DefDecor -- collapseBranchLenThresh :: HasBranchLen a => Double -> NewickTree a -> NewickTree a collapseBranchLenThresh thr tr = collapseBranches ((< thr) . getBranchLen) collapser tr where -- We REMOVE BootStraps as part of this process, they are not well-defined after this point. collapser _intermediate@(_bt1, len1) _floatee@(_bt2, len2) = (Nothing, len1 + len2) -- | A common configuration. Collapse branches based on bootstrap values. collapseBranchBootStrapThresh :: Int -> NewickTree DefDecor -> NewickTree DefDecor -- collapseBranchLenThresh :: HasBranchLen a => Double -> NewickTree a -> NewickTree a collapseBranchBootStrapThresh thr tr = collapseBranches ((< thr) . getBoot) collapser tr where getBoot (Nothing,_) = error$"collapseBranchBootStrapThresh: bootstrap value missing on tree node!\n"++ "They must be present if --minbootstrap is used." getBoot (Just boot,_) = boot -- This had better happen BEFORE branch-length based collapsing is done: collapser (_,len1) (_,len2) = (Nothing, len1+len2) PhyBin-0.3/Bio/Phylogeny/PhyBin/RFDistance.hs000066400000000000000000000430671221636131000207500ustar00rootroot00000000000000{-# LANGUAGE ScopedTypeVariables, CPP, BangPatterns #-} module Bio.Phylogeny.PhyBin.RFDistance ( -- * Types DenseLabelSet, DistanceMatrix, -- * Bipartition (Bip) utilities allBips, foldBips, dispBip, consensusTree, bipsToTree, filterCompatible, compatibleWith, -- * ADT for dense sets mkSingleDense, mkEmptyDense, bipSize, denseUnions, denseDiff, invertDense, markLabel, -- * Methods for computing distance matrices naiveDistMatrix, hashRF, -- * Output printDistMat) where import Control.Monad import Control.Monad.ST import Data.Function (on) import Data.Word import qualified Data.Vector as V import qualified Data.Vector.Mutable as MV import qualified Data.Vector.Unboxed.Mutable as MU import qualified Data.Vector.Unboxed as U import Text.PrettyPrint.HughesPJClass hiding (char, Style) import System.IO (hPutStrLn, hPutStr, Handle) import System.IO.Unsafe -- import Control.LVish -- import qualified Data.LVar.Set as IS -- import qualified Data.LVar.SLSet as SL -- import Data.LVar.Map as IM -- import Data.LVar.NatArray as NA import Bio.Phylogeny.PhyBin.CoreTypes import Bio.Phylogeny.PhyBin.PreProcessor (pruneTreeLeaves) -- import Data.BitList import qualified Data.Set as S import qualified Data.List as L import qualified Data.IntSet as SI import qualified Data.Map.Strict as M import qualified Data.Foldable as F import qualified Data.Traversable as T import Data.Monoid import Prelude as P import Debug.Trace #ifdef BITVEC_BIPS import qualified Data.Vector.Unboxed.Bit as UB import qualified Data.Bit as B #endif -- I don't understand WHY, but I seem to get the same answers WITHOUT this. -- Normalization and symmetric difference do make things somewhat slower (e.g. 1.8 -- seconds vs. 2.2 seconds for 150 taxa / 100 trees) #define NORMALIZATION -- define BITVEC_BIPS -------------------------------------------------------------------------------- -- A data structure choice -------------------------------------------------------------------------------- -- type DenseLabelSet s = BitList -- | Dense sets of taxa, aka Bipartitions or BiPs -- We assume that taxa labels have been mapped onto a dense, contiguous range of integers [0,N). -- -- NORMALIZATION Rule: Bipartitions are really two disjoint sets. But as long as -- the parent set (the union of the partitions, aka "all taxa") then a bipartition -- can be represented just by *one* subset. Yet we must choose WHICH subset for -- consistency. We use the rule that we always choose the SMALLER. Thus the -- DenseLabelSet should always be half the size or less, compared to the total -- number of taxa. -- -- A set that is more than a majority of the taxa can be normalized by "flipping", -- i.e. taking the taxa that are NOT in that set. #ifdef BITVEC_BIPS # if 1 type DenseLabelSet = UB.Vector B.Bit markLabel lab = UB.modify (\vec -> MU.write vec lab (B.fromBool True)) mkEmptyDense size = UB.replicate size (B.fromBool False) mkSingleDense size ind = markLabel ind (mkEmptyDense size) denseUnions = UB.unions bipSize = UB.countBits denseDiff = UB.difference invertDense size bip = UB.invert bip dispBip labs bip = show$ map (\(ix,_) -> (labs M.! ix)) $ filter (\(_,bit) -> B.toBool bit) $ zip [0..] (UB.toList bip) denseIsSubset a b = UB.or (UB.difference b a) traverseDense_ fn bip = U.ifoldr' (\ ix bit acc -> (if B.toBool bit then fn ix else return ()) >> acc) (return ()) bip # else -- TODO: try tracking the size: data DenseLabelSet = DLS {-# UNPACK #-} !Int (UB.Vector B.Bit) markLabel lab (DLS _ vec)= DLS (UB.modify (\vec -> return (MU.write vec lab (B.fromBool True))) ) vec -- .... # endif #else type DenseLabelSet = SI.IntSet markLabel lab set = SI.insert lab set mkEmptyDense _size = SI.empty mkSingleDense _size = SI.singleton denseUnions _size = SI.unions bipSize = SI.size denseDiff = SI.difference denseIsSubset = SI.isSubsetOf dispBip labs bip = "[" ++ unwords strs ++ "]" where strs = map (labs M.!) $ SI.toList bip invertDense size bip = loop SI.empty (size-1) where -- There's nothing for it but to iterate and test for membership: loop !acc ix | ix < 0 = acc | SI.member ix bip = loop acc (ix-1) | otherwise = loop (SI.insert ix acc) (ix-1) traverseDense_ fn bip = -- FIXME: need guaranteed non-allocating way to do this. SI.foldr' (\ix acc -> fn ix >> acc) (return ()) bip #endif markLabel :: Label -> DenseLabelSet -> DenseLabelSet mkEmptyDense :: Int -> DenseLabelSet mkSingleDense :: Int -> Label -> DenseLabelSet denseUnions :: Int -> [DenseLabelSet] -> DenseLabelSet bipSize :: DenseLabelSet -> Int -- | Print a BiPartition in a pretty form dispBip :: LabelTable -> DenseLabelSet -> String -- | Assume that total taxa are 0..N-1, invert membership: invertDense :: Int -> DenseLabelSet -> DenseLabelSet traverseDense_ :: Monad m => (Int -> m ()) -> DenseLabelSet -> m () -------------------------------------------------------------------------------- -- Dirt-simple reference implementation -------------------------------------------------------------------------------- type DistanceMatrix = V.Vector (U.Vector Int) -- | Returns a triangular distance matrix encoded as a vector. -- Also return the set-of-BIPs representation for each tree. -- -- This uses a naive method, directly computing the pairwise -- distance between each pair of trees. -- -- This method is TOLERANT of differences in the laba/taxa sets between two trees. -- It simply prunes to the intersection before doing the distance comparison. -- Other scoring methods may be added in the future. (For example, penalizing for -- missing taxa.) naiveDistMatrix :: [NewickTree DefDecor] -> (DistanceMatrix, V.Vector (S.Set DenseLabelSet)) naiveDistMatrix lst = let sz = P.length lst treeVect = V.fromList lst labelSets = V.map treeLabels treeVect eachbips = V.map allBips treeVect mat = V.generate sz $ \ i -> U.generate i $ \ j -> let inI = (labelSets V.! i) inJ = (labelSets V.! j) inBoth = S.intersection inI inJ -- Match will always succeed due to size==0 test below: Just prI = pruneTreeLeaves inBoth (treeVect V.! i) Just prJ = pruneTreeLeaves inBoth (treeVect V.! j) -- Memoization: If we are using it at its full size we can use the cached one: bipsI = if S.size inBoth == S.size inI then (eachbips V.! i) else allBips prI bipsJ = if S.size inBoth == S.size inJ then (eachbips V.! j) else allBips prJ diff1 = S.size (S.difference bipsI bipsJ) diff2 = S.size (S.difference bipsJ bipsI) -- symettric difference in if S.size inBoth == 0 then 0 -- This is weird, but what other answer could we give? else diff1 + diff2 in (mat, eachbips) where treeLabels :: NewickTree a -> S.Set Label treeLabels (NTLeaf _ lab) = S.singleton lab treeLabels (NTInterior _ ls) = S.unions (map treeLabels ls) -- | The number of bipartitions implied by a tree is one per EDGE in the tree. Thus -- each interior node carries a list of BiPs the same length as its list of children. labelBips :: NewickTree a -> NewickTree (a, [DenseLabelSet]) labelBips tr = -- trace ("labelbips "++show allLeaves++" "++show size) $ #ifdef NORMALIZATION fmap (\(a,ls) -> (a,map (normBip size) ls)) $ #endif loop tr where size = numLeaves tr zero = mkEmptyDense size loop (NTLeaf dec lab) = NTLeaf (dec, [markLabel lab zero]) lab loop (NTInterior dec chlds) = let chlds' = map loop chlds sets = map (denseUnions size . snd . get_dec) chlds' in NTInterior (dec, sets) chlds' allLeaves = leafSet tr leafSet (NTLeaf _ lab) = mkSingleDense size lab leafSet (NTInterior _ ls) = denseUnions size $ map leafSet ls -- normBip :: DenseLabelSet -> DenseLabelSet -> DenseLabelSet -- normBip allLeaves bip = normBip :: Int -> DenseLabelSet -> DenseLabelSet normBip totsize bip = let -- size = bipSize allLeaves halfSize = totsize `quot` 2 -- flipped = denseDiff allLeaves bip flipped = invertDense totsize bip in case compare (bipSize bip) halfSize of LT -> bip GT -> flipped -- Flip it EQ -> min bip flipped -- This is a painful case, we need a tie-breaker foldBips :: Monoid m => (DenseLabelSet -> m) -> NewickTree a -> m foldBips f tr = F.foldMap f' (labelBips tr) where f' (_,bips) = F.foldMap f bips -- | Get all non-singleton BiPs implied by a tree. allBips :: NewickTree a -> S.Set DenseLabelSet allBips tr = S.filter ((> 1) . bipSize) $ foldBips S.insert tr S.empty -------------------------------------------------------------------------------- -- Optimized, LVish version -------------------------------------------------------------------------------- -- First, necessary types: -- UNFINISHED: #if 0 -- | A collection of all observed bipartitons (bips) with a mapping of which trees -- contain which Bips. type BipTable s = IMap DenseLabelSet s (SparseTreeSet s) -- type BipTable = IMap BitList (U.Vector Bool) -- type BipTable s = IMap BitList s (NA.NatArray s Word8) -- | Sets of taxa (BiPs) that are expected to be sparse. type SparseTreeSet s = IS.ISet s TreeID -- TODO: make this a set of numeric tree IDs... -- NA.NatArray s Word8 type TreeID = AnnotatedTree -- | Tree's are identified simply by their order within the list of input trees. -- type TreeID = Int #endif -------------------------------------------------------------------------------- -- Alternate way of slicing the problem: HashRF -------------------------------------------------------------------------------- -- The distance matrix is an atomically-bumped matrix of numbers. -- type DistanceMat s = NA.NatArray s Word32 -- Except... bump isn't supported by our idempotent impl. -- | This version slices the problem a different way. A single pass over the trees -- populates the table of bipartitions. Then the table can be processed (locally) to -- produce (non-localized) increments to a distance matrix. hashRF :: Int -> [NewickTree a] -> DistanceMatrix hashRF num_taxa trees = build M.empty (zip [0..] trees) where num_trees = length trees -- First build the table: build acc [] = ingest acc build acc ((ix,hd):tl) = let bips = allBips hd acc' = S.foldl' fn acc bips fn acc bip = M.alter fn2 bip acc fn2 (Just membs) = Just (markLabel ix membs) fn2 Nothing = Just (mkSingleDense num_taxa ix) in build acc' tl -- Second, ingest the table to construct the distance matrix: ingest :: M.Map DenseLabelSet DenseLabelSet -> DistanceMatrix ingest bipTable = runST theST where theST :: forall s0 . ST s0 DistanceMatrix theST = do -- Triangular matrix, starting narrow and widening: matr <- MV.new num_trees -- Too bad MV.replicateM is insufficient. It should pass index. -- Instead we write this C-style: for_ (0,num_trees) $ \ ix -> do row <- MU.replicate ix (0::Int) MV.write matr ix row return () unsafeIOToST$ putStrLn$" Built matrix for dim "++show num_trees let bumpMatr i j | j < i = incr i j | otherwise = incr j i incr :: Int -> Int -> ST s0 () incr i j = do -- Not concurrency safe yet: -- unsafeIOToST$ putStrLn$" Reading at position "++show(i,j) row <- MV.read matr i elm <- MU.read row j MU.write row j (elm+1) return () fn bipMembs = -- Here we quadratically consider all pairs of trees and ask whether -- their edit distance is increased based on this particular BiP. -- Actually, as an optimization, it is sufficient to consider only the -- cartesian product of those that have and those that don't. let haveIt = bipMembs -- Depending on how invertDense is written, it could be useful to -- fuse this in and deforest "dontHave". dontHave = invertDense num_trees bipMembs fn1 trId = traverseDense_ (fn2 trId) dontHave fn2 trId1 trId2 = bumpMatr trId1 trId2 in -- trace ("Computed donthave "++ show dontHave) $ traverseDense_ fn1 haveIt F.traverse_ fn bipTable v1 <- V.unsafeFreeze matr T.traverse (U.unsafeFreeze) v1 -------------------------------------------------------------------------------- -- Miscellaneous Helpers -------------------------------------------------------------------------------- instance Pretty a => Pretty (S.Set a) where pPrint s = pPrint (S.toList s) printDistMat :: Handle -> V.Vector (U.Vector Int) -> IO () printDistMat h mat = do hPutStrLn h "Robinson-Foulds distance (matrix format):" hPutStrLn h "-----------------------------------------" V.forM_ mat $ \row -> do U.forM_ row $ \elem -> do hPutStr h (show elem) hPutStr h " " hPutStr h "0\n" hPutStrLn h "-----------------------------------------" -- My own forM for numeric ranges (not requiring deforestation optimizations). -- Inclusive start, exclusive end. {-# INLINE for_ #-} for_ :: Monad m => (Int, Int) -> (Int -> m ()) -> m () for_ (start, end) _fn | start > end = error "for_: start is greater than end" for_ (start, end) fn = loop start where loop !i | i == end = return () | otherwise = do fn i; loop (i+1) -- | Which of a set of trees are compatible with a consensus? filterCompatible :: NewickTree a -> [NewickTree b] -> [NewickTree b] filterCompatible consensus trees = let cbips = allBips consensus in [ tr | tr <- trees , cbips `S.isSubsetOf` allBips tr ] -- | `compatibleWith consensus tree` -- Is a tree compatible with a consensus? -- This is more efficient if partially applied then used repeatedly. -- -- Note, tree compatibility is not the same as an exact match. It's -- like (<=) rather than (==). The "star topology" is consistent with the -- all trees, because it induces the empty set of bipartitions. compatibleWith :: NewickTree a -> NewickTree b -> Bool compatibleWith consensus = let consBips = allBips consensus in \ newTr -> S.isSubsetOf consBips (allBips newTr) -- | Consensus between two trees, which may even have different label maps. consensusTreeFull (FullTree n1 l1 t1) (FullTree n2 l2 t2) = error "FINISHME - consensusTreeFull" -- | Take only the bipartitions that are agreed on by all trees. consensusTree :: Int -> [NewickTree a] -> NewickTree () consensusTree _ [] = error "Cannot take the consensusTree of the empty list" consensusTree num_taxa (hd:tl) = bipsToTree num_taxa intersection where intersection = L.foldl' S.intersection (allBips hd) (map allBips tl) -- intersection = loop (allBips hd) tl -- loop :: S.Set DenseLabelSet -> [NewickTree a] -> S.Set DenseLabelSet -- loop !remain [] = remain -- -- Was attempting to use foldBips here as an optimization: -- -- loop !remain (hd:tl) = loop (foldBips S.delete hd remain) tl -- loop !remain (hd:tl) = loop (S.difference remain (allBips hd)) tl -- | Convert from bipartitions BACK to a single tree. bipsToTree :: Int -> S.Set DenseLabelSet -> NewickTree () bipsToTree num_taxa origbip = -- trace ("Doing bips in order: "++show sorted++"\n") $ loop lvl0 sorted where -- We consider each subset in increasing size order. -- FIXME: If we tweak the order on BIPs, then we can just use S.toAscList here: sorted = L.sortBy (compare `on` bipSize) (S.toList origbip) lvl0 = [ (mkSingleDense num_taxa ix, NTLeaf () ix) | ix <- [0..num_taxa-1] ] -- VERY expensive! However, due to normalization issues this is necessary for now: -- TODO: in the future make it possible to definitively denormalize. -- isMatch bip x = denseIsSubset x bip || denseIsSubset x (invertDense num_taxa bip) isMatch bip x = denseIsSubset x bip -- We recursively glom together subtrees until we have a complete tree. -- We only process larger subtrees after we have processed all the smaller ones. loop !subtrees [] = case subtrees of [] -> error "bipsToTree: internal error" [(_,one)] -> one lst -> NTInterior () (map snd lst) loop !subtrees (bip:tl) = -- trace (" -> looping, subtrees "++show subtrees) $ let (in_,out) = L.partition (isMatch bip. fst) subtrees in case in_ of [] -> error $"bipsToTree: Internal error! No match for bip: "++show bip ++" out is\n "++show out++"\n and remaining bips "++show (length tl) ++"\n when processing orig bip set:\n "++show origbip -- loop out tl _ -> -- Here all subtrees that match the current bip get merged: loop ((denseUnions num_taxa (map fst in_), NTInterior () (map snd in_)) : out) tl PhyBin-0.3/Bio/Phylogeny/PhyBin/Util.hs000066400000000000000000000133101221636131000176670ustar00rootroot00000000000000{-# LANGUAGE ScopedTypeVariables #-} -- RecordWildCards, TypeSynonymInstances, CPP -- {-# LANGUAGE NamedFieldPuns #-} -- {-# LANGUAGE OverloadedStrings #-} -- {-# OPTIONS_GHC -fwarn-incomplete-patterns #-} -- {-# OPTIONS_GHC -fwarn-unused-imports #-} -- | This module contains misc bits used by (multiple) other modules. module Bio.Phylogeny.PhyBin.Util ( is_regular_file, acquireTreeFiles, safePrintDendro, sanityCheck ) where import qualified Data.Foldable as F import Data.Function (on) import Data.List (delete, minimumBy, sortBy, insertBy, intersperse, sort) import Data.Maybe (fromMaybe, catMaybes) import qualified Data.Map as M import qualified Data.Set as S import qualified Data.Text.Lazy as T import Control.Monad (forM, forM_, filterM, when, unless) import Control.Exception (evaluate) import Control.Applicative ((<$>),(<*>)) import Control.Concurrent (Chan) import System.FilePath (combine) import System.Directory (doesFileExist, doesDirectoryExist, getDirectoryContents, getCurrentDirectory) import System.IO (openFile, hClose, IOMode(ReadMode), stderr, hPutStr, hPutStrLn) import System.Exit (ExitCode(..)) import System.Timeout (timeout) import Test.HUnit ((~:),(~=?),Test,test) -- For vizualization: import Text.PrettyPrint.HughesPJClass hiding (char, Style) import Bio.Phylogeny.PhyBin.CoreTypes -- import Bio.Phylogeny.PhyBin.Parser (parseNewick) -- import Bio.Phylogeny.PhyBin.Visualize (dotToPDF, dotNewickTree, viewNewickTree) import qualified Data.Clustering.Hierarchical as C import qualified Data.Graph.Inductive as G import qualified Data.GraphViz as Gv import Data.GraphViz.Printing (renderDot) import Data.GraphViz.Types.Canonical (nodeStmts, graphStatements) ---------------------------------------------------------------------------------------------------- -- OS specific bits: ---------------------------------------------------------------------------------------------------- -- #ifdef WIN32 -- is_regular_file = undefined -- is_directory path = -- getFileAttributes -- --getFileInformationByHandle -- -- bhfiFileAttributes -- file_exists = undefined -- #else -- is_regular_file :: FilePath -> IO Bool -- is_regular_file file = -- do stat <- getFileStatus file; -- -- Hmm, this is probably bad practice... hard to know its exhaustive: -- return$ isRegularFile stat || isNamedPipe stat || isSymbolicLink stat -- is_directory :: FilePath -> IO Bool -- is_directory path = -- do stat <- getFileStatus path -- return (isDirectory stat) -- file_exists = fileExist -- #endif -- Here we ASSUME it exists, then these functions are good enough: is_regular_file :: FilePath -> IO Bool is_regular_file = doesFileExist is_directory :: FilePath -> IO Bool is_directory = doesDirectoryExist file_exists :: FilePath -> IO Bool file_exists path = do f <- doesFileExist path d <- doesDirectoryExist path return (f || d) -------------------------------------------------------------------------------- -- | Expand out directories to find all the tree files. acquireTreeFiles :: [String] -> IO [String] acquireTreeFiles inputs = do all :: [[String]] <- forM inputs $ \ path -> do exists <- file_exists path --stat <- if exists then getFileStatus path else return (error "internal invariant") -- [2010.09.23] This is no longer really necessary: if not exists then do error$ "No file or directory found at this path!: "++path -- hPutStr stderr$ "Input not a file/directory, assuming wildcard, using 'find' for expansion" -- entries <- HSH.run$ "find " ++ path -- hPutStrLn stderr$ "("++show (length entries)++" files found): "++ show path -- return entries else do isdir <- is_directory path reg <- is_regular_file path if isdir then do hPutStr stderr$ "Input is a directory, reading all regular files contained " children <- getDirectoryContents path filtered <- filterM is_regular_file $ map (combine path) children hPutStrLn stderr$ "("++show (length filtered)++" regular files found): "++ show path return$ filtered else if reg then do return [path] else error$ "phybin: Unhandled input path: " ++ path return (concat all) -------------------------------------------------------------------------------- -- | Step carefully in case of cycles (argh). safePrintDendro :: Gv.DotGraph G.Node -> IO (Maybe T.Text) safePrintDendro dotg= do -- putStrLn$ "Dendrogram graph size: "++ show (F.foldl' (\a _ -> a+1) 0 dotg) mx <- timeout (2 * 1000 * 1000) $ do -- putStrLn$ "Dendrogram graph, is directed?: "++ show (Gv.directedGraph dotg) putStrLn$ "Dendrogram graph size: "++ show (length $ nodeStmts $ graphStatements dotg) let str = renderDot $ Gv.toDot dotg evaluate (T.length str) return str case mx of Nothing -> do putStrLn "WARNING: DotGraph appears to be a cyclic structure. This is probably a bug." return Nothing _ -> return mx sanityCheck :: C.Dendrogram (FullTree DefDecor) -> IO () sanityCheck dendro = do let fn seen elm | S.member (treename elm) seen = error$"Dendrogram failed sanity check! Tree name occurs multiple times: "++(treename elm) | otherwise = S.insert (treename elm) seen sz = S.size $ F.foldl' fn S.empty dendro putStrLn$ "Sanity checked dendrogram of size: "++show sz PhyBin-0.3/Bio/Phylogeny/PhyBin/Visualize.hs000066400000000000000000000424071221636131000207360ustar00rootroot00000000000000{-# LANGUAGE NamedFieldPuns #-} module Bio.Phylogeny.PhyBin.Visualize (dotNewickTree, dotToPDF, viewNewickTree, dotNewickTree_debug, -- * Dendrogram visualization dendrogramToGraph, dotDendrogram ) where import Text.Printf (printf) import Data.List (elemIndex, isPrefixOf) import Data.List.Split (chunksOf) import Data.Maybe (fromJust, isJust) import qualified Data.Map as M import qualified Data.Set as S import qualified Data.Vector as V import Data.Text.Lazy (pack) import Control.Monad (void) import Control.Concurrent (Chan, newChan, writeChan, forkIO) import qualified Data.Graph.Inductive as G hiding (run) import qualified Data.GraphViz as Gv hiding (parse, toLabel) import qualified Data.GraphViz.Attributes.Complete as GA import qualified Data.GraphViz.Attributes.Colors as GC import Data.GraphViz.Attributes.Colors (Color(RGB)) -- import Test.HUnit ((~:),(~=?),Test,test) import System.Timeout (timeout) import qualified Data.Clustering.Hierarchical as C import Bio.Phylogeny.PhyBin.CoreTypes import Bio.Phylogeny.PhyBin.RFDistance (filterCompatible, compatibleWith, consensusTree) import Debug.Trace ---------------------------------------------------------------------------------------------------- -- Visualization with GraphViz and FGL: ---------------------------------------------------------------------------------------------------- -- First we need to be able to convert our trees to FGL graphs: toGraph :: FullTree StandardDecor -> G.Gr String Double toGraph (FullTree _ tbl tree) = G.run_ G.empty $ loop tree where fromLabel ix = tbl M.! ix loop (NTLeaf _ name) = do let str = fromLabel name _ <- G.insMapNodeM str return str loop (NTInterior (StandardDecor{sortedLabels}) ls) = do let bigname = concatMap fromLabel sortedLabels names <- mapM loop ls _ <- G.insMapNodeM bigname mapM_ (\x -> G.insMapEdgeM (bigname, x, 0.0)) names return bigname -- This version uses the tree nodes themselves as graph labels. toGraph2 :: FullTree StandardDecor -> G.Gr (NewickTree StandardDecor) Double toGraph2 (FullTree _ tbl tree) = G.run_ G.empty $ loop tree where loop node@(NTLeaf _ _) = do _ <- G.insMapNodeM node return () loop node@(NTInterior _ ls) = do mapM_ loop ls _ <- G.insMapNodeM node -- Edge weights as just branchLen (not bootstrap): mapM_ (\x -> G.insMapEdgeM (node, x, branchLen$ get_dec x)) ls return () ---------------------------------------------------------------------------------------------------- -- Dendrograms ---------------------------------------------------------------------------------------------------- -- | Convert to a dotGraph. Some duplicated code with dotNewickTree. dotDendrogram :: PhyBinConfig -> String -> Double -> C.Dendrogram (FullTree a) -> Maybe (M.Map TreeName Int) -> [[NewickTree ()]] -> Gv.DotGraph G.Node dotDendrogram PBC{show_trees_in_dendro, show_interior_consensus} title edge_scale origDendro mNameMap highlightTrs = Gv.graphToDot myparams (G.nmap uid graph) where (charsDropped, dendro) = truncateNames origDendro -- This is ugly, but we modify the name map to match: nameMap' = fmap (M.mapKeys (drop charsDropped)) mNameMap graph :: DendroGraph graph = dendrogramToGraph num_taxa dendro num_taxa = numLeaves $ nwtree fstLeaf FullTree{labelTable} = fstLeaf fstLeaf = getFstLeaf dendro getFstLeaf (C.Leaf ft) = ft getFstLeaf (C.Branch _ l _) = getFstLeaf l uidsToNames = M.fromList $ map (\nd@NdLabel{uid} -> (uid,nd)) $ -- filter (isJust . tre) $ map (fromJust . G.lab graph) $ G.nodes graph matchers = map mkMatcher highlightTrs mkMatcher ls = let fns = map compatibleWith ls -- Multiple predicate functions in \ tr -> -- Does it match any predicate? or$ map (\f -> f tr) fns wcolors = zip matchers defaultPalette findColor tr = loop wcolors where loop [] = Nothing loop ((f,col):rst) | f tr = Just col | otherwise = loop rst -- Map uids to highlight color highlightMap = M.map fromJust $ M.filter isJust $ M.map (\ (NdLabel _ _ _ ct) -> findColor ct) uidsToNames myparams :: Gv.GraphvizParams G.Node String Double () String -- (Either String String) myparams = Gv.defaultParams { Gv.globalAttributes= [Gv.GraphAttrs [GA.Label$ GA.StrLabel$ pack title]], Gv.fmtNode= nodeAttrs, Gv.fmtEdge= edgeAttrs } nodeAttrs :: (Int, UniqueNodeName) -> [GA.Attribute] nodeAttrs (_num, uid) = let uid' = if show_trees_in_dendro then printed_tree++"\n"++uid else uid printed_tree = case M.lookup uid uidsToNames of Nothing -> "" Just NdLabel{clumpSize,consensus} -> (if clumpSize>1 then "size "++show clumpSize++"\n" else "") ++ show (displayStrippedTree (FullTree "" labelTable consensus)) (tag,shp,styl) = -- case eith of if isPrefixOf "DUMMY_" uid then (if show_trees_in_dendro && show_interior_consensus then printed_tree else "", if show_interior_consensus then GA.BoxShape -- GA.PlainText else GA.PointShape, [ GA.Color [weighted$ GA.X11Color Gv.Transparent] ] ) else (uid', GA.Ellipse, [ GA.Style [GA.SItem GA.Filled []]]) highlightColor = case M.lookup uid highlightMap of Nothing -> [] Just col -> [ GA.Color [weighted col ] ] clustColor | not (null highlightTrs) = [] | otherwise = case (nameMap', M.lookup uid uidsToNames) of (Just nm, Just NdLabel{tre=Just FullTree{treename}}) -> case M.lookup treename nm of Nothing -> [] -- Here we color the top TEN clusters: Just ind | ind <= 10 -> [ GA.Color [weighted$ defaultPaletteV V.! (ind-1) ] ] | otherwise -> [] -- TODO: shall we color intermediate nodes? _ -> [] in [ GA.Label$ GA.StrLabel$ pack tag , GA.Shape shp ] ++ styl ++ highlightColor ++ clustColor edgeAttrs = getEdgeAttrs edge_scale type UniqueNodeName = String -- type DendroGraph = G.Gr (Maybe TreeName,UniqueNodeName) Double type DendroGraph = G.Gr NdLabel Double -- | When we first convert to a graph representation, there is a bunch of information -- hanging off of each node. data NdLabel = NdLabel { uid :: UniqueNodeName , tre :: Maybe (FullTree ()) , clumpSize :: !Int , consensus :: NewickTree () } deriving (Show, Ord, Eq) -- | Create a graph using TreeNames for node labels and edit-distance for edge labels. dendrogramToGraph :: Int -> C.Dendrogram (FullTree a) -> DendroGraph dendrogramToGraph num_taxa orig = G.run_ G.empty $ void$ loop (fmap (fmap (const())) orig) where -- loop node@(C.Leaf ft) = G.insMapNodeM (NdLabel (treename ft) (Just$ fmap (const()) ft) 1 ft) loop node@(C.Leaf ft) = let stripped = fmap (const()) ft in G.insMapNodeM (NdLabel (treename ft) (Just stripped) 1 (nwtree stripped)) loop node@(C.Branch 0 left right) = do -- As a preprocessing step we collapse clusters that are separated by zero edit distance. ---------------------------------------- let lvs = collapseZeroes left ++ collapseZeroes right nms = map (treename) lvs lens = map length nms total = sum lens avg = total `quot` length nms -- The goal here is to make an approximately square arrangement: -- goal: avg * perline == total / (avg * perline) perline = ceiling$ sqrt (fromIntegral total / ((fromIntegral avg)^2)) chunked = chunksOf perline nms fatname = unlines (map unwords chunked) G.insMapNodeM (NdLabel fatname (Just (head lvs)) (length lvs) (consensusTree num_taxa (map nwtree lvs))) ---------------------------------------- loop node@(C.Branch dist left right) = do (_,ll@(NdLabel lid _ s1 c1)) <- loop left (_,rr@(NdLabel rid _ s2 c2)) <- loop right -- Interior nodes do NOT have their names drawn: let ndname = "DUMMY_"++(lid++"_"++rid) -- HACK! (midN,mid) <- G.insMapNodeM (NdLabel ndname Nothing (s1+s2) (consensusTree num_taxa [c1,c2])) G.insMapEdgeM (ll, mid, dist) G.insMapEdgeM (rr, mid, dist) return (midN,mid) collapseZeroes (C.Leaf tr) = [tr] collapseZeroes (C.Branch 0 l r) = collapseZeroes l ++ collapseZeroes r collapseZeroes oth = error "dendrogramToGraph: internal error. Not expecting non-zero branch length here." -- | The plot looks nicer when the names aren't bloated with repeated stuff. This -- replaces all tree names with potentially shorter names by removing the common prefix. -- Returns how many characters were dropped. truncateNames :: C.Dendrogram (FullTree a) -> (Int, C.Dendrogram (FullTree a)) truncateNames dendro = (prefChars, fmap chopName dendro) where chopName ft = ft{ treename= drop prefChars (treename ft) } prefChars = length$ commonPrefix$ S.toList$ allNames dendro allNames (C.Leaf tr) = S.singleton (treename tr) allNames (C.Branch _ l r) = S.union (allNames l) (allNames r) ---------------------------------------------------------------------------------------------------- -- | Open a GUI window to displaya tree. -- -- Fork a thread that then runs graphviz. -- The channel retuned will carry a single message to signal -- completion of the subprocess. viewNewickTree :: String -> FullTree StandardDecor -> IO (Chan (), FullTree StandardDecor) -- TODO: UPDATE THIS TO RETURN AN ASYNC!! viewNewickTree title tree@(FullTree{nwtree}) = do chan <- newChan let dot = dotNewickTree title (1.0 / avg_branchlen [nwtree]) tree runit = do mx <- -- timeout defaultTimeout $ -- This one is interactive, so we don't need a timeout. Gv.runGraphvizCanvas default_cmd dot Gv.Xlib -- case mx of -- Nothing -> putStrLn$ "WARNING: call to graphviz TIMED OUT."++file -- _ -> return () writeChan chan () --str <- prettyPrint d --putStrLn$ "Generating the following graphviz tree:\n " ++ str _ <- forkIO runit --do runGraphvizCanvas Dot dot Xlib; return () return (chan, tree) --default_cmd = TwoPi -- Totally ignores edge lengths. default_cmd :: Gv.GraphvizCommand default_cmd = Gv.Neato -- Show a float without scientific notation: myShowFloat :: Double -> String -- showFloat weight = showEFloat (Just 2) weight "" myShowFloat fl = let rnd = round fl in if fl == fromIntegral rnd then show rnd else printf "%.4f" fl -- | Convert a .dot file to .pdf. dotToPDF :: Gv.DotGraph G.Node -> FilePath -> IO (Maybe FilePath) dotToPDF dot file = do -- If we have any problem with graphviz we want to time that out rather than let -- our whole run hang: x <- timeout defaultTimeout $ Gv.runGraphvizCommand default_cmd dot Gv.Pdf file case x of Nothing -> do putStrLn$ "WARNING: call to graphviz TIMED OUT. File not plotted: "++file return Nothing _ -> return x -- Arbitrary: 15 second timeout. defaultTimeout :: Int defaultTimeout = (15 * 1000 * 1000) -- | Convert a NewickTree to a graphviz Dot graph representation. dotNewickTree :: String -> Double -> FullTree StandardDecor -> Gv.DotGraph G.Node dotNewickTree title edge_scale atree@(FullTree _ tbl tree) = --trace ("EDGE SCALE: " ++ show edge_scale) $ Gv.graphToDot myparams graph where graph = toGraph2 atree fromLabel ix = tbl M.! ix myparams :: Gv.GraphvizParams G.Node (NewickTree StandardDecor) Double () (NewickTree StandardDecor) myparams = Gv.defaultParams { Gv.globalAttributes= [Gv.GraphAttrs [GA.Label$ GA.StrLabel$ pack title]], Gv.fmtNode= nodeAttrs, Gv.fmtEdge= edgeAttrs } nodeAttrs :: (Int,NewickTree StandardDecor) -> [GA.Attribute] nodeAttrs (_num, node) = let children = get_children node in [ GA.Label$ GA.StrLabel$ pack$ concatMap fromLabel $ sortedLabels $ get_dec node , GA.Shape (if null children then {-PlainText-} GA.Ellipse else GA.PointShape) , GA.Style [GA.SItem GA.Filled []] ] edgeAttrs = getEdgeAttrs edge_scale -- | Some arbitrarily chosen colors from the X11 set: defaultPaletteV :: V.Vector GA.Color defaultPaletteV = V.fromList defaultPalette defaultPalette :: [GA.Color] defaultPalette = concat$ replicate 4 $ map GA.X11Color [ Gv.Aquamarine , Gv.PaleVioletRed , Gv.MediumPurple , Gv.PaleGreen , Gv.PapayaWhip , Gv.SkyBlue , Gv.Yellow , Gv.Crimson , Gv.Gray , Gv.PaleGoldenrod ] -- Grabbed the first couple palettes off a website: altPalette :: V.Vector GA.Color altPalette = V.fromList $ concat $ replicate 3 $ -- http://www.colourlovers.com/palette/2962435/Autumn_Roses [ RGB 159 74 81, RGB 217 183 173, RGB 149 91 116, RGB 185 138 148 -- , RGB 101 69 82 -- too dark ] ++ -- http://www.colourlovers.com/palette/2962434/Earthy_warm [ RGB 108 74 39, RGB 207 179 83, RGB 180 149 60, RGB 244 242 185 -- , RGB 61 63 39 ] getEdgeAttrs :: Double -> (t, t1, Double) -> [GA.Attribute] getEdgeAttrs edge_scale = edgeAttrs where -- TOGGLE: -- edgeAttrs (_,_,weight) = [ArrowHead noArrow, Len (weight * edge_scale + bump), GA.Label (StrLabel$ show (weight))] edgeAttrs (_,_,weight) = let draw_weight = compute_draw_weight weight edge_scale in --trace ("EDGE WEIGHT "++ show weight ++ " drawn at "++ show draw_weight) $ [GA.ArrowHead Gv.noArrow, GA.Label$ GA.StrLabel$ pack$ myShowFloat weight] ++ -- TEMPTOGGLE --[ArrowHead noArrow, GA.Label (StrLabel$ show draw_weight)] ++ -- TEMPTOGGLE if weight == 0.0 then [GA.Color [weighted$ GA.X11Color Gv.Red], GA.LWidth 3.0, GA.Len minlen] else [GA.Len draw_weight] minlen = 0.7 maxlen = 3.0 compute_draw_weight w scale = let scaled = (abs w) * scale + minlen in -- Don't draw them too big or it gets annoying: (min scaled maxlen) weighted c = GC.WC {GC.wColor=c, GC.weighting=Nothing} -- | This version shows the ordered/rooted structure of the normalized tree. dotNewickTree_debug :: String -> FullTree StandardDecor -> Gv.DotGraph G.Node dotNewickTree_debug title atree@(FullTree _ tbl tree) = Gv.graphToDot myparams graph where graph = toGraph2 atree fromLabel ix = tbl M.! ix myparams :: Gv.GraphvizParams G.Node (NewickTree StandardDecor) Double () (NewickTree StandardDecor) myparams = Gv.defaultParams { Gv.globalAttributes= [Gv.GraphAttrs [GA.Label$ GA.StrLabel$ pack title]], Gv.fmtNode= nodeAttrs, Gv.fmtEdge= edgeAttrs } nodeAttrs :: (Int,(NewickTree StandardDecor)) -> [GA.Attribute] nodeAttrs (num,node) = let children = get_children node in [ GA.Label (if null children then GA.StrLabel$ pack$ concatMap fromLabel $ sortedLabels $ get_dec node else GA.RecordLabel$ take (length children) $ -- This will leave interior nodes unlabeled: map (GA.PortName . GA.PN . pack) $ map show [1..] -- This version gives some kind of name to interior nodes: -- map (\ (i,ls) -> LabelledTarget (PN$ show i) (fromLabel$ head ls)) $ -- zip [1..] (map (thd3 . get_dec) children) ) , GA.Shape GA.Record , GA.Style [GA.SItem GA.Filled []] ] edgeAttrs (num1, num2, _weight) = let node1 = fromJust$ G.lab graph num1 node2 = fromJust$ G.lab graph num2 ind = fromJust$ elemIndex node2 (get_children node1) in [GA.TailPort$ GA.LabelledPort (GA.PN$ pack$ show$ 1+ind) (Just GA.South)] -------------------------------------------------------------------------------- -- Unit Testing -------------------------------------------------------------------------------- -- tt0 :: IO (Chan (), FullTree) -- tt0 = drawNewickTree "tt0" $ annotateWLabLists $ parseNewick "" "(A,(C,D,E),B);" -- tt3 :: IO (Chan (), FullTree) -- tt3 = drawNewickTree "tt3" tt -- tt4 :: IO (Chan (), FullTree) -- tt4 = drawNewickTree "tt4"$ trace ("FINAL: "++ show (pPrint norm4)) $ norm4 -- tt5 :: IO (Chan (), FullTree) -- tt5 = drawNewickTree "tt5"$ norm5 -- tt5' :: String -- tt5' = prettyPrint' $ dotNewickTree "norm5" 1.0 norm5 -- ttall :: IO (Chan (), FullTree) -- ttall = do tt3; tt4; tt5 -- tt2 :: G.Gr String Double -- tt2 = toGraph tt -- unitTests :: Test -- unitTests = test -- [ -- ] -- TEMP / HACK: prettyPrint' :: Show a => a -> String prettyPrint' = show -- | Common prefix of a list of lists. commonPrefix :: Eq a => [[a]] -> [a] commonPrefix [] = [] commonPrefix ls@(hd:tl) | any null ls = [] | otherwise = if all ((== (head hd)) . head) tl then head hd : commonPrefix (map tail ls) else commonPrefix (map tail ls) PhyBin-0.3/DEVLOG.md000066400000000000000000000076521221636131000140540ustar00rootroot00000000000000 [2012.11.19] ------------ Right now the rickettsia dataset is causing problems: phybin.exe: Rickettsia/phybin_output/bin1_16.txt: openFile: does not exist (No such file or directory) [2013.07.22] {Beginning optimization of all-to-all RF distance} --------------------------------------------------------------- Making the labels into Ints and switching to IntSet has sped up the 150-taxa/100-trees example from 3 sec to 1.6 seconds on my macbook retina. [2013.07.23] {Debugging} ------------------------ Ok, clustering on RFDistance with --editdist 0 should give the same answers as phybin --bin, but it's not! On Irene's current Wolbachia dataset, traditional phybin --bin comes up with this as its top cluster: RAxML_bipartitions.88 RAxML_bipartitions.506 RAxML_bipartitions.476 RAxML_bipartitions.39 RAxML_bipartitions.386 RAxML_bipartitions.321 RAxML_bipartitions.3 RAxML_bipartitions.276 RAxML_bipartitions.260 RAxML_bipartitions.256 RAxML_bipartitions.233 RAxML_bipartitions.197 RAxML_bipartitions.171 RAxML_bipartitions.165 RAxML_bipartitions.141 RAxML_bipartitions.116 By contrast, --UPGMA --editdist0 comes up with this: RAxML_bipartitions.120 RAxML_bipartitions.167 RAxML_bipartitions.250 RAxML_bipartitions.38 RAxML_bipartitions.444 RAxML_bipartitions.494 This is with no branch-collapsing... Ugh, that's no ovelap at all. If we look at the second bin for --UPGMA though, we see overlap: RAxML_bipartitions.233 RAxML_bipartitions.3 RAxML_bipartitions.321 RAxML_bipartitions.39 RAxML_bipartitions.476 RAxML_bipartitions.88 Ok, at least one is a subset of the other this time... and here's the diff: RAxML_bipartitions.116 RAxML_bipartitions.141 RAxML_bipartitions.165 RAxML_bipartitions.171 RAxML_bipartitions.197 RAxML_bipartitions.256 RAxML_bipartitions.260 RAxML_bipartitions.276 RAxML_bipartitions.386 RAxML_bipartitions.506 And here's the tree that phybin --bin produced: ((14, 3_), (19, (5_, 13)), ((1_, 2_), (7_, (18, 6_)))); I singled out 116 and 233, and then printed their normalized forms using "phybin -p 2 --printnorms tests/t2/*": Tree "116" ((1_, 2_), (7_, (18, 6_)), ((14, 3_), (19, (13, 5_)))) Tree "233" ((1_, 2_), (7_, (18, 6_)), ((14, 3_), (19, (13, 5_)))) And UNNORMALIZED: Tree "116" (3_, ((((6_, 18), 7_), (2_, 1_)), (19, (13, 5_))), 14) Tree "233" (5_, (19, ((3_, 14), ((2_, 1_), (7_, (6_, 18))))), 13) Wait.... this makes it look like normalization is WRONG!!! Tree 233 starts with 13+5 in a three-way split with a big chunk containing everything else. In fact, the RFDistance is three, which is probably right. [2013.07.23] {Updating set reps, timing} ---------------------------------------- Ok, went ahead and switched over to unboxed bitvectors for DenseLabelSet. It's 10X slower! The BITVEC_BIPS option is getting the right answer now though. Apparently the normalization bug just fixed was the only bug. For 100 trees, 150 taxa (from the hashrf suite): * IntSet takes 2.2s, 7.2G alloc, 114M copied. * bit vec takes 21s and 69G alloc, 264M copied. Productivities are high in both cases. Oops... there was a bug in bipSize. I'm surprised it was getting the right answer. Fixing that brought it down to 39G alloc, 12.3s. Stil, bit vectors are NOT a win for the direct/simple N^2 version. [2013.07.25] {Comparison ogainst HashRF} ---------------------------------------- Note that HashRF doesn't have very good error messages. For example, if given our messy data with sum trees missing taxa: ./hashrf Wolbachia/all_trees.tr 928 *** Collecting the taxon labels *** Number of taxa = 10 *** Reading tree file and collecting bipartitions *** libc++abi.dylib: terminate called throwing an exception Abort trap: 6 In fact... even if I use phybin to prune out the 503 trees with 10 taxa, I still get the error. PhyBin-0.3/LICENSE000066400000000000000000000032701221636131000135470ustar00rootroot00000000000000NOTE: This License applies to all files in the "Haskell CnC" distribution, irrespective of whether the license is appended at the top of each file. BSD STANDARD THREE CLAUSE LICENSE ("New BSD License") Copyright (c) 2010, Intel Corporation. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of the nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PhyBin-0.3/Main.hs000066400000000000000000000745511221636131000137740ustar00rootroot00000000000000{-# LANGUAGE RecordWildCards, TupleSections, NamedFieldPuns #-} {-# OPTIONS_GHC -fwarn-unused-imports -fwarn-incomplete-patterns #-} -- | The MAIN module, of course. This is the script that deals with -- command line options and calling into the heart of the beast. module Main where import Data.List (sort, intersperse, foldl') import Data.List.Split (splitOneOf) import qualified Data.ByteString.Char8 as B import qualified Data.Map as M import qualified Data.Set as S import Control.Monad import Control.Concurrent (readChan) import System.Environment (getArgs, withArgs) import System.Console.GetOpt (OptDescr(Option), ArgDescr(..), ArgOrder(..), usageInfo, getOpt) import System.Exit (exitSuccess) import System.IO (stdout) import Control.Exception (catch, SomeException) import qualified Data.Vector as V import Test.HUnit as HU import Bio.Phylogeny.PhyBin.CoreTypes import Bio.Phylogeny.PhyBin (driver, normalize, annotateWLabLists, unitTests, acquireTreeFiles, deAnnotate, retrieveHighlights, matchAnyHighlight) import Bio.Phylogeny.PhyBin.Parser (parseNewick, parseNewickFiles, unitTests) import Bio.Phylogeny.PhyBin.Visualize (viewNewickTree) -- dotNewickTree_debug import Bio.Phylogeny.PhyBin.RFDistance import Bio.Phylogeny.PhyBin.PreProcessor import qualified Data.Clustering.Hierarchical as C import Data.Version (showVersion) import Paths_phybin (version) ---------------------------------------------------------------------------------------------------- -- MAIN script: Read command line options and call the program. ---------------------------------------------------------------------------------------------------- -- Note: ORDER is important here, we process options in this order: data Flag = Verbose | Version | Output String | SetNumTaxa Int | BranchThresh Double | BootStrapThresh Int | PruneTaxa [String] | NullOpt | Graph | Draw | Force | View | TabDelimited Int Int | Highlight FilePath | ShowTreesInDendro | ShowInterior | RFMode WhichRFMode | SelfTest | RFMatrix | LineSetDiffMode | PrintNorms | PrintReg | PrintConsensus | PrintMatching | Cluster C.Linkage | BinningMode | EditDistThresh Int | DendogramOnly | NameCutoff String | NamePrefix Int | NameTable String -- Must come after Cutoff/Prefix deriving (Show, Eq, Ord) -- ORD is really used. parseTabDelim :: String -> Flag parseTabDelim _str = TabDelimited 8 9 options :: [OptDescr Flag] options = [ Option ['v'] ["verbose"] (NoArg Verbose) "print WARNINGS and other information (recommended at first)" , Option ['V'] ["version"] (NoArg Version) "show version number" , Option ['o'] ["output"] (ReqArg Output "DIR") "set directory to contain all output files (default \"./phybin_out/\")" , Option [] ["selftest"] (NoArg SelfTest) "run internal unit tests" {- -- TODO: FIXME: IMPLEMENT THIS: , Option [] [] (NoArg NullOpt) "" , Option ['t'] ["tabbed"] (ReqArg parseTabDelim "NUM1:NUM2")$ "assume the input is a ab-delimited file with gene names \n"++ "in column NUM1 and Newick trees in NUM2" -} , Option [] [] (NoArg NullOpt) "" , Option [] [] (NoArg$ error "internal problem") "----------------------------- Clustering Options ------------------------------" , Option [] ["bin"] (NoArg BinningMode)$ "Use simple binning, the cheapest form of 'clustering'" , Option [] ["single"] (NoArg$ Cluster C.SingleLinkage) $ "Use single-linkage clustering (nearest neighbor)" , Option [] ["complete"] (NoArg$ Cluster C.CompleteLinkage)$ "Use complete-linkage clustering (furthest neighbor)" , Option [] ["UPGMA"] (NoArg$ Cluster C.UPGMA) $ "Use Unweighted Pair Group Method (average linkage) - DEFAULT mode" , Option [] ["editdist"] (ReqArg (EditDistThresh . read) "DIST")$ "Combine all clusters separated by DIST or less. Report a flat list of clusters.\n"++ "Irrespective of whether this is activated, a hierarchical clustering (dendogram.pdf) is produced." -- , Option [] ["dendogram"] (NoArg DendogramOnly)$ "Report a hierarchical clustering (default)" , Option [] [] (NoArg NullOpt) "" , Option [] [] (NoArg$ error "internal problem") "--- Select Robinson-Foulds (symmetric difference) distance algorithm: ---" , Option [] ["hashrf"] (NoArg$ RFMode HashRF) "(default) use a variant of the HashRF algorithm for the distance matrix" , Option [] ["tolerant"] (NoArg$ RFMode TolerantNaive) "use a slower, modified RF metric that tolerates missing taxa" , Option [] [] (NoArg NullOpt) "" , Option [] [] (NoArg$ error "internal problem") "----------------------------- Visualization --------------------------------" , Option ['g'] ["graphbins"] (NoArg Graph) "use graphviz to produce .dot and .pdf output files" -- TODO: Produce the consensus tree as well as the individual trees. , Option ['d'] ["drawbins"] (NoArg Draw) "like -g, but open GUI windows to show each bin's tree" , Option ['w'] ["view"] (NoArg View)$ "for convenience, \"view mode\" simply displays input Newick files without binning" , Option [] ["showtrees"] (NoArg ShowTreesInDendro) "Print (textual) tree topology inside the nodes of the dendrogram" , Option [] ["highlight"] (ReqArg Highlight "FILE") $ "Highlight nodes in the tree-of-trees (dendrogram) consistent with the.\n"++ "given tree file. Multiple highlights are permitted and use different colors." -- "Highlights may be SUBTREES that use any subset of the taxa." , Option [] ["interior"] (NoArg ShowInterior) "Show the consensus trees for interior nodes in the dendogram, rather than just points." , Option [] [] (NoArg NullOpt) "" , Option [] [] (NoArg$ error "internal problem") "---------------------------- Tree pre-processing -----------------------------" , Option [] ["prune"] (ReqArg (PruneTaxa . parseTaxa) "TAXA") ("Prune trees to only TAXA before doing anything else.\n"++ "Space and comma separated lists of taxa are allowed. Use quotes.") , Option ['b'] ["minbranchlen"] (ReqArg (BranchThresh . read) "LEN") "collapse branches less than LEN" , Option [] ["minbootstrap"] (ReqArg (BootStrapThresh . read) "INT") "collapse branches with bootstrap values less than INT" -- , Option ['n'] ["numtaxa"] (ReqArg (SetNumTaxa . read) "NUM") -- "expect NUM taxa for this dataset, demand trees have all of them" , Option [] [] (NoArg NullOpt) "" , Option [] [] (NoArg$ error "internal problem") "--------------------------- Extracting taxa names ----------------------------" -- , Option ['n'] ["numtaxa"] (ReqArg (SetNumTaxa . read) "NUM") "expect NUM taxa for this dataset (otherwise it will guess)" -- TODO, FIXME: The "guessing" part doesn't actually work yet -- implement it!! -- What's a good algorithm? Insist they all have the same number? Take the mode? {- -- TODO: FIXME: IMPLEMENT THIS: , Option ['f'] ["force"] (NoArg Force) "force phybin to consume and bin trees with different numbers of taxa" -} , Option [] [] (NoArg NullOpt) "" , Option ['p'] ["nameprefix"] (ReqArg (NamePrefix . read) "NUM") $ "Leaf names in the input Newick trees can be gene names, not taxa.\n"++ "Then it is typical to extract taxa names from genes. This option extracts\n"++ "a prefix of NUM characters to serve as the taxa name." , Option [] [] (NoArg NullOpt) "" , Option ['s'] ["namesep"] (ReqArg NameCutoff "STR") $ --"\n"++ "An alternative to --nameprefix, STR provides a set of delimeter characters,\n"++ "for example '-' or '0123456789'. The taxa name is then a variable-length\n"++ "prefix of each gene name up to but not including any character in STR." , Option [] [] (NoArg NullOpt) "" , Option ['m'] ["namemap"] (ReqArg NameTable "FILE") $ "Even once prefixes are extracted it may be necessary to use a lookup table\n"++ "to compute taxa names, e.g. if multiple genes/plasmids map onto one taxa.\n"++ "This option specifies a text file with find/replace entries of the form\n"++ "\" \", which are applied AFTER -s and -p." , Option [] [] (NoArg NullOpt) "" , Option [] [] (NoArg$ error "internal problem") "--------------------------- Utility Modes ----------------------------" , Option [] ["rfdist"] (NoArg RFMatrix) "print a Robinson Foulds distance matrix for the input trees" , Option [] ["setdiff"] (NoArg LineSetDiffMode) "for convenience, print the set difference between cluster*.txt files" , Option [] ["print"] (NoArg PrintReg) "simply print out a concise form of each input tree" , Option [] ["printnorms"] (NoArg PrintNorms) "simply print out a concise and NORMALIZED form of each input tree" , Option [] ["consensus"] (NoArg PrintConsensus) "print a strict consensus tree for the inputs, then exit" , Option [] ["matching"] (NoArg PrintMatching) "print a list of tree names that match any --highlight argument" ] -- | Parse the list of taxa provided on the command line. parseTaxa :: String -> [String] parseTaxa str = filter (not . null) $ splitOneOf ",; " str usage :: String usage = "\nUsage: phybin [OPTION...] files or directories...\n\n"++ -- "MODE must be one of 'bin' or 'cluster'.\n\n" ++ "PhyBin takes Newick tree files as input. Paths of Newick files can\n"++ "be passed directly on the command line. Or, if directories are provided,\n"++ "all files in those directories will be read. Taxa are named based on the files\n"++ "containing them. If a file contains multiple trees, all are read by phybin, and\n"++ "the taxa name then includes a suffix indicating the position in the file:\n"++ " e.g. FILENAME_0, FILENAME_1, etc.\n"++ "\n"++ -- "In binning mode, Phybin output contains, at minimum, files of the form binXX_YY.tr,\n"++ -- "each containing a list of input file paths that fall into that bin. XX signifies\n"++ -- "the rank of the bin (biggest first), and YY is how many trees it contains.\n"++ -- "\n"++ "When clustering trees, Phybin computes a complete all-to-all Robinson-Foulds distance matrix.\n"++ "If a threshold distance (tree edit distance) is given, then a flat set of clusters\n"++ "will be produced in files clusterXX_YY.tr. Otherwise it produces a full dendogram.\n"++ "\n"++ "Binning mode provides an especially quick-and-dirty form of clustering.\n"++ "When running with the --bin option, only exactly equal trees are put in the same cluster.\n"++ "Tree pre-processing still applies, however: for example collapsing short branches.\n"++ "\n"++ "USAGE NOTES: \n"++ " * Currently phybin ignores input trees with the wrong number of taxa.\n"++ " * If given a directory as input phybin will assume all contained files are Newick trees.\n\n"++ "\nOptions include:\n" defaultErr :: [String] -> t defaultErr errs = error $ "ERROR!\n" ++ (concat errs ++ usageInfo usage options) -------------------------------------------------------------------------------- main :: IO () main = do argv <- getArgs (opts,files) <- case getOpt Permute options argv of (o,n,[] ) -> return (o,n) (_,_,errs) -> defaultErr errs all_inputs <- acquireTreeFiles files let process_opt cfg opt = case opt of NullOpt -> return cfg Verbose -> return cfg { verbose= True } Version -> do putStrLn$ "phybin version "++ showVersion version; exitSuccess SelfTest -> do _ <- runTestTT allUnitTests; exitSuccess RFMatrix -> return cfg { print_rfmatrix= True } ShowTreesInDendro -> return cfg { show_trees_in_dendro = True } ShowInterior -> return cfg { show_interior_consensus = True } Highlight path -> return cfg { highlights = path : highlights cfg } LineSetDiffMode -> do bss <- mapM B.readFile files case map (S.fromList . B.lines) bss of [set1,set2] -> do let [f1,f2] = files let diff = S.difference set1 set2 putStrLn$" Found "++show(S.size diff)++" lines occuring in "++f1++" but not "++f2 mapM_ B.putStrLn $ S.toList diff oth -> error $"Line set difference mode expects two files as input, got "++show(length oth) exitSuccess PrintNorms -> return cfg PrintReg -> return cfg PrintConsensus -> return cfg PrintMatching -> return cfg Cluster lnk -> return cfg { clust_mode = ClusterThem lnk } RFMode TolerantNaive | Expected _ <- num_taxa cfg -> error "should not set --numtaxa AND --tolerant" | otherwise -> return cfg { rfmode = TolerantNaive } RFMode HashRF -> return cfg { rfmode = HashRF } BinningMode -> return cfg { clust_mode = BinThem } EditDistThresh n -> return cfg { dist_thresh = Just n } DendogramOnly -> return cfg { dist_thresh = Nothing } Output s -> return cfg { output_dir= s } SetNumTaxa n | rfmode cfg == TolerantNaive -> error "should not set --tolerant AND --numtaxa" | otherwise -> return cfg { num_taxa= Expected n } BranchThresh n -> return cfg { branch_collapse_thresh = Just n } BootStrapThresh n -> return cfg { bootstrap_collapse_thresh = Just n } PruneTaxa ls -> return cfg { preprune_labels = Just ls } Graph -> return cfg { do_graph= True } Draw -> return cfg { do_draw = True } View -> return cfg -- Handled below TabDelimited _ _ -> error "tabbed option not handled yet" Force -> error "force option not handled yet" NameCutoff str -> let chopper = takeWhile (not . flip S.member (S.fromList str)) in return cfg { name_hack = chopper . name_hack cfg } NamePrefix n -> return cfg { name_hack = (take n) . name_hack cfg } -- This should always be after cutoff/prefix: NameTable file -> do reader <- name_table_reader file return cfg { name_hack = reader . name_hack cfg } config <- foldM process_opt default_phybin_config{ inputs= all_inputs } (sort opts) -- NOTE: options processed in sorted order. when (null files) $ do defaultErr ["No file arguments!\n"] -------------------------------------------------------------------- ----- Handle SPECIAL modes before going into normal processing ----- -------------------------------------------------------------------- -- This mode kicks in AFTER config options are processed. when (elem PrintReg opts) $ do (_,fts) <- parseNewickFiles (name_hack config) all_inputs forM_ fts $ \ ft@(FullTree name _ _) -> do putStrLn $ "Tree "++show name putStrLn$ show$ displayDefaultTree ft exitSuccess ------------------------------------------------------------ when (elem PrintNorms opts) $ do (_,fts) <- parseNewickFiles (name_hack config) all_inputs forM_ fts $ \ ft@(FullTree name _ _) -> do putStrLn $ "Tree , NORMALIZED:"++show name putStrLn$ show$ displayDefaultTree$ deAnnotate $ liftFT (normalize . annotateWLabLists) ft exitSuccess ------------------------------------------------------------ when (elem PrintConsensus opts) $ case (num_taxa config) of Expected numtax -> do (_,fts) <- parseNewickFiles (name_hack config) all_inputs putStrLn $ "Strict Consensus Tree of "++show (length fts)++" trees:" when (null fts) $ error "No trees provided!" let ctree = consensusTree numtax (map nwtree fts) FullTree{labelTable} = head fts print$ displayStrippedTree$ FullTree "" labelTable ctree exitSuccess _ -> error "--consensus: cannot compute consensus when number of taxa is unknown or variable" ------------------------------------------------------------ when (elem PrintMatching opts) $ do (labs,fts) <- parseNewickFiles (name_hack config) all_inputs when (null$ highlights config) $ error "No --highlight given, so --matching makes no sense. Matching what?" htrs <- retrieveHighlights (name_hack config) labs (highlights config) let isMatch = matchAnyHighlight htrs forM_ fts $ \ FullTree{treename,nwtree} -> when (isMatch$ fmap (const()) nwtree) $ putStrLn treename exitSuccess ------------------------------------------------------------ when (View `elem` opts) $ do view_graphs config exitSuccess ------------------------------------------------------------ -- Otherwise do the main, normal thing: driver config -- | Completely optional visualization step. If this fails, we don't -- sweat it. view_graphs :: PhyBinConfig -> IO () view_graphs PBC{..} = catch act hndl where hndl :: SomeException -> IO () hndl exn = putStrLn$ "Warning: Viewing graph failed, ignoring. Error was: "++show exn act = do chans <- forM inputs $ \ file -> do putStrLn$ "Drawing "++ file ++"...\n" str <- B.readFile file putStrLn$ "Parsed: " ++ (B.unpack str) let (tbl,tr) = parseNewick M.empty name_hack file str (chan, _tr) <- viewNewickTree file (FullTree "" tbl (annotateWLabLists (head tr))) return chan forM_ chans readChan return () -------------------------------------------------------------------------------- -- Every dataset it seems needs a new hack on the names! name_table_reader :: String -> IO (String -> String) name_table_reader file = do contents <- readFile file let mp = M.fromList $ map (\ls -> case ls of [a,b] -> (a,b) _ -> error$ "Each line of "++file++" must contain two whitespace free strings: "++ unwords ls) $ filter (not . null) $ map tokenize $ lines contents return$ \ name_to_hack -> case M.lookup name_to_hack mp of -- Could use a trie Just x -> x Nothing -> name_to_hack where tokenize :: String -> [String] tokenize line = case words line of [] -> [] [one] -> error$"Name table contained bad line: "++ show one [one,two] -> [one,two] (one:rest) -> [one, unwords rest] temp :: IO () temp = driver default_phybin_config{ num_taxa= Expected 7, inputs=["../datasets/test.tr"] } -------------------------------------------------------------------------------- -- Aggregated Unit Tests -------------------------------------------------------------------------------- allUnitTests :: Test -- allUnitTests = unitTests ++ allUnitTests = test [ Bio.Phylogeny.PhyBin.unitTests , Bio.Phylogeny.PhyBin.Parser.unitTests , "norm/Bip1" ~: (testNorm prob1) , "bipTreeConversion" ~: testBipConversion , "t3" ~: t3_consensusTest, "t4" ~: t4_consensusTest, "t5" ~: t5_consensusTest ] -- [2013.07.23] -- This was INCORRECTLY normalizing to: -- ((1_, 2_), (7_, (18, 6_)), ((14, 3_), (19, (13, 5_)))) prob1 :: String prob1 = "(5_, (19, ((3_, 14), ((2_, 1_), (7_, (6_, 18))))), 13);" -- | Make sure that the normalized version of a tree yields the same bipartitions as -- the unnormalized one. testNorm :: String -> IO () testNorm str = do let (labs,parseds) = parseNewick M.empty id "test" (B.pack str) parsed = head parseds normed = normalize $ annotateWLabLists parsed bips1 = allBips parsed bips2 = allBips normed added = S.difference bips2 bips1 removed = S.difference bips1 bips2 -- dispBips bip = show$ -- map (map (labs M.!)) $ -- map IS.toList$ S.toList bip unless (bips1 == bips2) $ do putStrLn$ "Normalized this: "++show (displayDefaultTree $ FullTree "" labs parsed) putStrLn$ "To this : "++show (displayDefaultTree $ deAnnotate $ FullTree "" labs normed) error$ "Normalization added and removed these bipartitions, respectively:\n " ++concat (intersperse " " (map (dispBip labs) (S.toList added))) ++"\n " ++concat (intersperse " " (map (dispBip labs) (S.toList removed))) -- 112 and 13 rftest :: IO () rftest = do (_mp,[t1,t2]) <- parseNewickFiles (take 2) ["tests/13.tr", "tests/112.tr"] putStrLn$ "Tree 13 : " ++ show (displayDefaultTree t1) putStrLn$ "Tree 112 : "++ show (displayDefaultTree t2) putStrLn$ "Tree 13 normed : "++ show (disp t1) putStrLn$ "Tree 112 normed : "++ show (disp t2) putStrLn$ "13 collapsed 0.02: " ++show (disp$ liftFT (collapseBranchLenThresh 0.02) t1) putStrLn$ "112 collapsed 0.02: " ++show (disp$ liftFT (collapseBranchLenThresh 0.02) t2) putStrLn$ "13 collapsed 0.03: " ++show (disp$ liftFT (collapseBranchLenThresh 0.03) t1) putStrLn$ "112 collapsed 0.03: " ++show (disp$ liftFT (collapseBranchLenThresh 0.03) t2) let (mat,_) = naiveDistMatrix [nwtree t1, nwtree t2] printDistMat stdout mat return () where disp (FullTree nm labs tr) = let collapsed :: AnnotatedTree collapsed = normalize$ annotateWLabLists tr in displayDefaultTree$ deAnnotate $ FullTree nm labs collapsed -- | This test was done with --editdist 4 --complete t3_consensusTest :: IO () t3_consensusTest = consensusTest "./tests/t3_consensus/cluster1_284_alltrees.tr" "./tests/t3_consensus/cluster1_284_consensus.tr" -- | This test was done with --editdist 0 --complete t4_consensusTest :: IO () t4_consensusTest = consensusTest "./tests/t4_consensus/cluster1_16_alltrees.tr" "./tests/t4_consensus/cluster1_16_consensus.tr" -- | This test was done with --editdist 1 --complete t5_consensusTest :: IO () t5_consensusTest = consensusTest "./tests/t5_consensus/cluster1_35_alltrees.tr" "./tests/t5_consensus/cluster1_35_consensus.tr" consensusTest :: String -> String -> IO () consensusTest alltrees consensus = do (_,ctree:ftrees) <- parseNewickFiles id [consensus,alltrees] let num_taxa = numLeaves (nwtree ctree) plainTrs = map nwtree ftrees eachbips = map allBips plainTrs totalBips = foldl' S.union S.empty eachbips intersectBips = foldl' S.intersection S.empty eachbips FullTree _ labs _ = ctree linesPrnt x = unlines (map ((" "++) . dispBip labs) $ S.toList x) putStrLn$ "Bips in each: "++ show (map S.size eachbips) putStrLn$ "Total bips in all: "++show (S.size totalBips) putStrLn$ "Bips in common: "++ show (S.size intersectBips) putStrLn$ "Bips of first member:\n" ++ linesPrnt (head eachbips) putStrLn$ "Some bips in the union that are NOT in the intersection:\n" ++ linesPrnt (S.fromList$ take 20$ S.toList$ S.difference totalBips intersectBips) let cbips = allBips $ nwtree ctree putStrLn$ "ConsensusBips ("++show (S.size cbips)++"):\n"++linesPrnt cbips putStrLn$"Things in the consensus that should NOT be:\n"++linesPrnt (S.difference cbips intersectBips) putStrLn$"Things not in the consensus that SHOULD be:\n"++linesPrnt (S.difference intersectBips cbips) putStrLn$ "Now recomputing consensus tree for "++show num_taxa++" taxa" let ctree2 = consensusTree num_taxa plainTrs cbips2 = allBips ctree2 putStrLn$ "Freshly recomputed consensusBips ("++show (S.size cbips2)++"):\n"++linesPrnt cbips2 HU.assertEqual "Consensus tree on disk should match computed one:" cbips cbips2 -- (allBips$ fmap (const ()) $ nwtree ctree) putStrLn " Partial distance matrix WITHIN this cluster:" let (mat,_) = naiveDistMatrix (map nwtree ftrees) printDistMat stdout (V.take 30 mat) HU.assertBool "Consensus should only include bicbips2ps in the members" (S.isSubsetOf cbips totalBips) HU.assertEqual "Consensus tree matches intersected bips" cbips intersectBips return () testBipConversion :: IO () testBipConversion = do (_,trs) <- allTestTrees mapM_ testOne trs putStrLn "All trees passed bip conversion." where testOne (FullTree{nwtree}) = do let sz = numLeaves nwtree bips = allBips nwtree tr2 = bipsToTree sz bips bips2 = allBips tr2 assertEqual "round trip bips->tree->bips" bips bips2 -- | Read in all test trees which we happen to have put in the repository for testing -- purposes. allTestTrees :: IO (LabelTable, [FullTree DefDecor]) allTestTrees = parseNewickFiles id $ [ "./tests/t3_consensus/cluster1_284_alltrees.tr" , "./tests/t3_consensus/cluster1_284_consensus.tr" , "./tests/t4_consensus/cluster1_16_alltrees.tr" , "./tests/t4_consensus/cluster1_16_consensus.tr" , "./tests/t5_consensus/cluster1_35_alltrees.tr" , "./tests/t5_consensus/cluster1_35_consensus.tr" ] ---------------------------------------------------------------------------------------------------- -- TODO: expose a command line argument for testing. -- The below test exposed my normalization bug relating to "deriving Ord". -- I need to transform it into one or more proper unit tests. main_test :: IO () main_test = withArgs ["-w","~/newton_and_newton_local/datasets/yersinia/yersinia_trees/111.dnd","-m","../datasets/yersinia/name_table_hack_yersinia.txt"] main -- [2013.07.22] Disabling for the new Label representation: {- pa :: NewickTree DefDecor pa = set_dec (Nothing,1) $ NTInterior () [NTInterior () [NTLeaf () "RE",NTInterior () [NTLeaf () "SD",NTLeaf () "SM"]],NTInterior () [NTLeaf () "BB",NTLeaf () "BJ"],NTInterior () [NTLeaf () "MB",NTLeaf () "ML"]] pb :: NewickTree DefDecor pb = set_dec (Nothing,1) $ NTInterior () [NTInterior () [NTLeaf () "BB",NTLeaf () "BJ"],NTInterior () [NTLeaf () "MB",NTLeaf () "ML"],NTInterior () [NTLeaf () "RE",NTInterior () [NTLeaf () "SD",NTLeaf () "SM"]]] ls1 :: [(String, NewickTree DefDecor)] ls1 = [("a",pa),("b",pb)] -- This is one: num_binned :: Int num_binned = M.size $ binthem ls1 a_ :: (String, NewickTree DefDecor) a_ = ("980.dnd", fmap (\x -> (Nothing,x)) $ NTInterior 0.0 [NTInterior 5.697e-2 [NTLeaf 3.95e-2 "SM",NTLeaf 5.977e-2 "SD"],NTLeaf 0.13143 "RE",NTInterior 0.13899 [NTInterior 9.019e-2 [NTLeaf 0.11856 "BB",NTLeaf 0.13592 "BJ"],NTInterior 0.13194 [NTLeaf 0.19456 "MB",NTLeaf 0.16603 "ML"]]]) b_ :: (String, NewickTree DefDecor) b_ = ("999.dnd", fmap (\x -> (Nothing,x)) $ NTInterior 0.0 [NTInterior 6.527e-2 [NTInterior 0.13734 [NTLeaf 2.975e-2 "SM",NTLeaf 3.002e-2 "SD"],NTLeaf 0.18443 "RE"],NTInterior 6.621e-2 [NTLeaf 0.16184 "MB",NTLeaf 0.15233 "ML"],NTInterior 0.23143 [NTLeaf 9.192e-2 "BB",NTLeaf 0.10125 "BJ"]]) -- But THIS is two: ack! num2 :: Int num2 = M.size $ binthem [a_,b_] -- Here's the test that breaks things: a_norm :: NewickTree StandardDecor a_norm = normalize (annotateWLabLists$ snd a_) b_norm_ :: NewickTree (Double, Int, [Label]) b_norm_ = NTInterior (0.0,7,["BB","BJ","MB","ML","RE","SD","SM"]) [NTInterior (0.23143,2,["BB","BJ"]) [NTLeaf (9.192e-2,1,["BB"]) "BB",NTLeaf (0.10125,1,["BJ"]) "BJ"],NTInterior (6.621e-2,2,["MB","ML"]) [NTLeaf (0.16184,1,["MB"]) "MB",NTLeaf (0.15233,1,["ML"]) "ML"],NTInterior (6.527e-2,3,["RE","SD","SM"]) [NTLeaf (0.18443,1,["RE"]) "RE",NTInterior (0.13734,2,["SD","SM"]) [NTLeaf (3.002e-2,1,["SD"]) "SD",NTLeaf (2.975e-2,1,["SM"]) "SM"]]] b_norm :: NewickTree StandardDecor b_norm = fmap (\ (bl,w,ls) -> StandardDecor bl Nothing w ls) b_norm_ d1 :: IO (Chan (), NewickTree StandardDecor) d1 = viewNewickTree "" a_norm d2 :: IO (Chan (), NewickTree StandardDecor) d2 = viewNewickTree "" b_norm d1_ :: IO ThreadId d1_ = forkIO $ do runGraphvizCanvas Dot (dotNewickTree_debug "" a_norm) Xlib; return () d2_ :: IO ThreadId d2_ = forkIO $ do runGraphvizCanvas Dot (dotNewickTree_debug "" b_norm) Xlib; return () -- | A of a tree with _____ weights attached to it: withBootstrap :: String withBootstrap = "((((A8F330_:0.01131438136322714984,(G0GWK2_:0.00568050636963043226,(Q92FV4_:0.00284163304504484121,((B0BVQ5_:0.00319487112504297311,A8GU65_:0.00000122123005994819)74:0.00279881991324161267,(C3PM27_:0.00560787769333294297,C4K2Z0_:0.00559642713265556899)15:0.00000122123005994819)4:0.00000122123005994819)56:0.00276851661606284868)60:0.00283144414216590342)76:0.00886304965525876697,(A8GQC0_:0.05449879836105625541,(A8F0B2_:0.04736199885985507840,Q4UJN9_:0.02648399728559588939)64:0.00905997055810744446)28:0.00323255855543533657)29:0.02237505187863457132,(Q1RGK5_:0.00000122123005994819,A8GYD7_:0.00000122123005994819)100:0.28299884298270094884)100:0.05776841634437222123,(Q9ZC84_:0.00000122123005994819,D5AYH5_:0.00000122123005994819)99:0.00951976341375833368,Q68VM9_:0.04408933524904214141);" withBootstrap2 :: String withBootstrap2 = "((((A8F330_:0.01131438136322714984,(G0GWK2_:0.00568050636963043226,(Q92FV4_:0.00284163304504484121,((B0BVQ5_:0.00319487112504297311,A8GU65_:0.00000122123005994819):0.00279881991324161267[74],(C3PM27_:0.00560787769333294297,C4K2Z0_:0.00559642713265556899):0.00000122123005994819[15]):0.00000122123005994819[4]):0.00276851661606284868[56]):0.00283144414216590342[60]):0.00886304965525876697[76],(A8GQC0_:0.05449879836105625541,(A8F0B2_:0.04736199885985507840,Q4UJN9_:0.02648399728559588939):0.00905997055810744446[64]):0.00323255855543533657[28]):0.02237505187863457132[29],(Q1RGK5_:0.00000122123005994819,A8GYD7_:0.00000122123005994819):0.28299884298270094884[100]):0.05776841634437222123[100],(Q9ZC84_:0.00000122123005994819,D5AYH5_:0.00000122123005994819):0.00951976341375833368[99],Q68VM9_:0.04408933524904214141);" -} PhyBin-0.3/Makefile000066400000000000000000000077621221636131000142140ustar00rootroot00000000000000 default: haskell all: find_tree_matches.exe haskell # GHC=ghc-7.4.2 GHC=ghc-7.6.3 release: haskell upx phybin.exe du -h phybin.exe haskell: cabal configure cabal build cp ./dist/build/phybin/phybin phybin.exe du -h phybin.exe strip phybin.exe du -h phybin.exe win: $(GHC) -DWIN32 --make Main.hs -o phybin.exe find_tree_matches.exe: find_tree_matches.ss mzc --exe find_tree_matches.exe find_tree_matches.ss clean: rm -f phybin *.exe *.hi *.o Bio/Phylogeny/PhyBin/*.o Bio/Phylogeny/PhyBin/*.hi ################################################################################ #launch: clean release doc distro push upload launch: doc distro push upload doc: pandoc -s -S --toc -c website.css README.md -o website/index.html release_website: cp website/* ~/.hyplan/projects/phybin/ distro: cabal sdist cp -v dist/*.tar.gz website/ # HASKELLSRV=haskell.intel HASKELLSRV=community.haskell.org ################################################################################ # Collecting shortcuts for running different datasets: # LOCALSETS=~/newton_and_newton_local/datasets LOCALSETS=. rhizobia: rm -rf $(LOCALSETS)/rhizobia/phybin_outputs mkdir $(LOCALSETS)/rhizobia/phybin_outputs ./phybin.exe $(LOCALSETS)/rhizobia/gde/*.dnd -o $(LOCALSETS)/rhizobia/phybin_output/ -p 2 -n 7 -g -v #yersinia_test_names: # ./phybin.exe -w ~/newton_and_newton_local/datasets/yersinia/yersinia_trees/111.dnd -s . -m ../datasets/yersinia/name_table_hack_yersinia.txt yersinia: rm -rf $(LOCALSETS)/yersinia/phybin_outputs mkdir $(LOCALSETS)/yersinia/phybin_outputs ./phybin.exe $(LOCALSETS)/yersinia/yersinia_trees/ -o $(LOCALSETS)/yersinia/phybin_outputs/ -m ../datasets/yersinia/name_table_hack_yersinia.txt -s . -n 9 -g legionella: rm -rf $(LOCALSETS)/legionella/phybin_outputs mkdir $(LOCALSETS)/legionella/phybin_outputs ./phybin.exe $(LOCALSETS)/legionella/legionella_orthologs_aa/ -o $(LOCALSETS)/legionella/phybin_outputs/ -m ../datasets/legionella/name_table_hack_legionella.txt -s 0123456789 -n 4 -g # Newer ones [2012.11.19]: newbatch: rickettsia rickettsiales wolbachia # BRANCHTHRESH=0.01 BRANCHTHRESH=0.05 PHYBIN= ./phybin.exe -b $(BRANCHTHRESH) -g #------------------------------------------------------------ rickettsia: $(LOCALSETS)/Rickettsia/renaming_table.txt rm -rf $(LOCALSETS)/Rickettsia/phybin_outputs mkdir $(LOCALSETS)/Rickettsia/phybin_outputs $(PHYBIN) -n 15 -m $(LOCALSETS)/Rickettsia/renaming_table.txt -s '_' -o $(LOCALSETS)/Rickettsia/phybin_output/ $(LOCALSETS)/Rickettsia/final_trees/*BranchLab*.out $(LOCALSETS)/Rickettsia/renaming_table.txt: $(LOCALSETS)/Rickettsia/Rickettsia_orthololgs.txt runghc stripTable.hs $^ > $@ #------------------------------------------------------------ rickettsiales: $(LOCALSETS)/Rickettsiales/renaming_table.txt rm -rf $(LOCALSETS)/Rickettsiales/phybin_outputs mkdir $(LOCALSETS)/Rickettsiales/phybin_outputs $(PHYBIN) -n 29 -m $(LOCALSETS)/Rickettsiales/renaming_table.txt -s '_' -o $(LOCALSETS)/Rickettsiales/phybin_output/ $(LOCALSETS)/Rickettsiales/final_trees/*BranchLab*.out $(LOCALSETS)/Rickettsiales/renaming_table.txt: $(LOCALSETS)/Rickettsiales/Rickettsiales_orthologs.txt runghc stripTable.hs $^ > $@ #------------------------------------------------------------ wolbachia: $(LOCALSETS)/Wolbachia/renaming_table.txt rm -rf $(LOCALSETS)/Wolbachia/phybin_outputs mkdir $(LOCALSETS)/Wolbachia/phybin_outputs $(PHYBIN) -n 4 -m $(LOCALSETS)/Wolbachia/renaming_table.txt -s '_' -o $(LOCALSETS)/Wolbachia/phybin_output/ $(LOCALSETS)/Wolbachia/final_trees/*BranchLab*.out $(LOCALSETS)/Wolbachia/renaming_table.txt: $(LOCALSETS)/Wolbachia/Wolbachia_orthologs.txt runghc stripTable.hs $^ > $@ vary_branchlen: $(LOCALSETS)/Rickettsia/renaming_table.txt $(LOCALSETS)/Rickettsiales/renaming_table.txt $(LOCALSETS)/Wolbachia/renaming_table.txt ./vary_branchlen.sh 15 Rickettsia ./vary_branchlen.sh 29 Rickettsiales ./vary_branchlen.sh 4 Wolbachia temp: ./phybin.exe ~/newton_and_newton_local/datasets/rhizobia/gde/*.dnd -o temp -p 2 -n 7 PhyBin-0.3/README.md000066400000000000000000000174361221636131000140320ustar00rootroot00000000000000% PhyBin (0.3): Binning/Clustering Newick Trees by Topology PhyBin is a simple command line tool that classifies a set of [Newick tree files](http://en.wikipedia.org/wiki/Newick_format) by their topology. The purpose of it is to take a large set of tree files and browse through the most common tree topologies. ![(Above figure) Trees corresponding to the three largest bins resulting from a phybin run. The file `binXX_YYY`, where `XX` is the rank of the bin and `YYY` is the number of trees having that topology.](trees.jpg) Change Log ========== In version 0.2, PhyBin was extended to do clustering as well as binning: * Computee full all-to-all Robinson-Foulds distance matrices (quickly) * Hierarchical clustering of all trees into a tree-of-trees dendrogram based on Robinson Foulds symmetric (tree edit) distance. In version 0.3, PhyBin gained a number of new features * A `--tolerant` mode for computing RF distance matrices even for trees missing taxa. * A `--prune` option for "zooming in" on a specific set of taxa. * The `--minboostrap` option was added. Invoking PhyBin =============== PhyBin is a command-line program that produces output in the form of text files and pdfs, but to produce pdfs (to visualize trees) the [GraphViz program](http://www.graphviz.org/), including the `dot` command, must be installed on the machine. The following is a simple invocation of PhyBin: phybin --bin *.tree -o output_dir/ The input trees can be specified directly on the command-line, or, if the name of a directory is provided instead, all contained files are assumed to be trees in Newick format. PhyBin, at minimum, produces files of the form `output_dir/clusterXX_YY.tr`, one for each bin. If requested, it will also produce visual representations of each bin in the form `output_dir/clusterXX_YY.pdf`. Downloading and Installing PhyBin ================================= The source code to PhyBin can be downloaded here: * [Download Source from Hackage](http://hackage.haskell.org/package/phybin) PhyBin is written in Haskell and if you have [Haskell Platform](http://hackage.haskell.org/platform/). installed you can install phybin with this one-liner: cabal install phybin Otherwise PhyBin is also available for download as a statically-linked executable for Mac-OS, Linux, and Windows: * [Download Mac-OS Binary](phybin-0.2.11.mac) * [Download Linux Binary](phybin-0.2.11.x86_64_linux) * [Download Windows Binary](phybin-0.2.11_windows.exe) Command-line Options ==================== In addition to input files and directories, `phybin` supports a number of command-line options. Run "phybin --help" to see these options. Here is a snapshot of the current help output (version 0.2.11): Usage: phybin [OPTION...] files or directories... PhyBin takes Newick tree files as input. Paths of Newick files can be passed directly on the command line. Or, if directories are provided, all files in those directories will be read. Taxa are named based on the files containing them. If a file contains multiple trees, all are read by phybin, and the taxa name then includes a suffix indicating the position in the file: e.g. FILENAME_0, FILENAME_1, etc. When clustering trees, Phybin computes a complete all-to-all Robinson-Foulds distance matrix. If a threshold distance (tree edit distance) is given, then a flat set of clusters will be produced in files clusterXX_YY.tr. Otherwise it produces a full dendogram (UNFINISHED). Binning mode provides an especially quick-and-dirty form of clustering. When running with the --bin option, only exactly equal trees are put in the same cluster. Tree pre-processing still applies, however: for example collapsing short branches. USAGE NOTES: * Currently phybin ignores input trees with the wrong number of taxa. * If given a directory as input phybin will assume all contained files are Newick trees. Options include: -v --verbose print WARNINGS and other information (recommended at first) -V --version show version number -o DIR --output=DIR set directory to contain all output files (default "./phybin_out/") --selftest run internal unit tests ----------------------------- Clustering Options ------------------------------ --bin Use simple binning, the cheapest form of 'clustering' --single Use single-linkage clustering (nearest neighbor) --complete Use complete-linkage clustering (furthest neighbor) --UPGMA Use Unweighted Pair Group Method (average linkage) - DEFAULT mode --editdist=DIST Combine all clusters separated by DIST or less. Report a flat list of clusters. Irrespective of whether this is activated, a hierarchical clustering (dendogram.pdf) is produced. Select Robinson-Foulds (symmetric difference) distance algorithm: --simple use direct all-to-all comparison --hashrf (default) use a variant of the HashRF algorithm for the distance matrix ----------------------------- Visualization -------------------------------- -g --graphbins use graphviz to produce .dot and .pdf output files -d --drawbins like -g, but open GUI windows to show each bin's tree -w --view for convenience, "view mode" simply displays input Newick files without binning --showtrees Print (textual) tree topology inside the nodes of the dendrogram --highlight=FILE Highlight nodes in the tree-of-trees (dendrogram) consistent with the. given tree file. Multiple highlights are permitted and use different colors. --interior Show the consensus trees for interior nodes in the dendogram, rather than just points. ---------------------------- Tree pre-processing ----------------------------- -n NUM --numtaxa=NUM expect NUM taxa for this dataset -b LEN --branchcut=LEN collapse branches less than LEN --------------------------- Extracting taxa names ---------------------------- -p NUM --nameprefix=NUM Leaf names in the input Newick trees can be gene names, not taxa. Then it is typical to extract taxa names from genes. This option extracts a prefix of NUM characters to serve as the taxa name. -s STR --namesep=STR An alternative to --nameprefix, STR provides a set of delimeter characters, for example '-' or '0123456789'. The taxa name is then a variable-length prefix of each gene name up to but not including any character in STR. -m FILE --namemap=FILE Even once prefixes are extracted it may be necessary to use a lookup table to compute taxa names, e.g. if multiple genes/plasmids map onto one taxa. This option specifies a text file with find/replace entries of the form " ", which are applied AFTER -s and -p. --------------------------- Utility Modes ---------------------------- --rfdist print a Robinson Foulds distance matrix for the input trees --setdiff for convenience, print the set difference between cluster*.txt files --print simply print out a concise form of each input tree --printnorms simply print out a concise and NORMALIZED form of each input tree --consensus print a strict consensus tree for the inputs, then exit --matching print a list of tree names that match any --highlight argument - - - - - - - - - - - - - - - Authors: Irene and Ryan Newton Contact email: `irnewton` and `rrnewton` at `indiana` `edu` (with "at" and "dot" inserted). [Irene's](http://www.bio.indiana.edu/faculty/directory/profile.php?person=irnewton) and [Ryan](http://www.cs.indiana.edu/~rrnewton/homepage.html) homepages. . PhyBin-0.3/Setup.hs000066400000000000000000000005011221636131000141700ustar00rootroot00000000000000#!/usr/bin/env runhaskell import Distribution.Simple import Distribution.Simple.PreProcess import Distribution.PackageDescription import Distribution.Simple.LocalBuildInfo import System.Cmd(system) import System.Exit main :: IO () main = do putStrLn$ "Running Setup.hs ..." defaultMainWithHooks simpleUserHooks PhyBin-0.3/TestAll.hs000066400000000000000000000024071221636131000144470ustar00rootroot00000000000000{-# LANGUAGE TemplateHaskell #-} {-# LANGUAGE ScopedTypeVariables, BangPatterns, ParallelListComp, OverloadedStrings #-} module Main where import Data.Vector as V import Data.Vector.Unboxed as U import Data.ByteString.Char8 as B import Bio.Phylogeny.PhyBin import Bio.Phylogeny.PhyBin.CoreTypes import Bio.Phylogeny.PhyBin.RFDistance import Bio.Phylogeny.PhyBin.Parser import System.IO import Prelude as P import qualified Test.HUnit as HU -- import Test.Framework import Test.Framework.Providers.HUnit import Test.Framework.TH (defaultMainGenerator) -- Here we compare a tree against the same tree with 18 removed: -- Or the same tree with one intermediate node removed. (_,t30) = parseNewicks id $ [ ("t30_0", "(2_, (((14, 3_), (19, (5_, 13))), (18, (6_, 7_))), 1_);") , ("t30_1", "(2_, (((14, 3_), (19, (5_, 13))), (6_, 7_)), 1_);") , ("t30_2", "(2_, (((14, 3_), (19, (5_, 13))), (18, 6_, 7_)), 1_);") ] t30' = naiveDistMatrix$ P.map nwtree t30 -- "[[],[1]]" case_t30 = HU.assertEqual "3-tree distance matrix" "[[],[0],[1,0]]" (showMat t30') -- m = printDistMat stdout (fst d) -- Simple show for a distance matrix: showMat (m,_) = show$ V.toList$ V.map U.toList m main = $(defaultMainGenerator) PhyBin-0.3/old_script/000077500000000000000000000000001221636131000147025ustar00rootroot00000000000000PhyBin-0.3/old_script/cheztrace.ss000066400000000000000000000032071221636131000172230ustar00rootroot00000000000000;; [2005.11.13 ] Having a go at implementing these in PLT: ;; DOESNT WORK YET. (module cheztrace mzscheme ; (require (lib "errortrace.ss" "errortrace") ; (lib "stacktrace.ss" "errortrace")) (require (lib "trace.ss")) (provide trace-lambda trace-define trace-let inspect break) ;; Just stubs. (define (inspect x) (void)) (define (break x) x) #; (define-syntax trace-lambda (lambda (x) (syntax-case x () [(_ params bods ...) #`(lambda params #,(annotate #'(begin bods ...) #f))]))) (define-syntax trace-lambda (lambda (x) (syntax-case x () [(_ name (params ...) bods ...) (identifier? #'name) #'(let ((name (lambda (params ...) bods ...))) (trace name) name)]))) #| (define-syntax trace-define (lambda (x) (syntax-case x () [(_ (id v ...) e) #'(begin (define (id v ...) e) (trace id))] [(_ x e) (identifier? #'x) #'(begin (define x e) (trace x))]))) (define-syntax trace-let (syntax-rules () [(_ n ([l* r*] ...) bods ...) (letrec ((n (lambda (l* ...) bods ...))) (trace n) (n r* ...))])) |# ;; [2006.03.23] For now killing these ^^^, not sure how much they worked. ;; These simply don't do anything under PLT for the moment. (define-syntax trace-define (syntax-rules () [(_ e ...) (define e ...)])) (define-syntax trace-let (syntax-rules () [(_ e ...) (let e ...)])) ) ;(require cheztrace) ;(define f (trace-lambda (x) x (if (odd? x) (error 'foo "") (add1 x)))) ;(f 40) ;(f 39) ;(trace-let loop ((n 10)) ; (if (> n 0) (loop (sub1 n)))) PhyBin-0.3/old_script/find_tree_matches.ss000066400000000000000000000331301221636131000207140ustar00rootroot00000000000000#! /bin/bash #| exec mzscheme -qu "$0" ${1+"$@"} |# ;;==================================================================================================== ;; This is a script to parse .dnd tree files. ;; NOTE: This was the "Version 1" script and has been replaced by the haskell version, phybin.hs ;;==================================================================================================== ;; [2009.12.06] Hacking it to work with wolbachia_A_group/input (module find_tree_matches mzscheme ;; Import the parser and lexer generators. (require (lib "yacc.ss" "parser-tools") (lib "lex.ss" "parser-tools") (prefix : (lib "lex-sre.ss" "parser-tools")) (lib "list.ss") ;(lib "iu-match.ss") "iu-match.ss" ) ; (provide (all-defined)) (require (lib "pretty.ss") (lib "process.ss") (prefix srfi: (lib "1.ss" "srfi"))) ;;==================================================================================================== ;; First define the PARSER ;;==================================================================================================== (define-empty-tokens op-tokens (LeftParen RightParen Comma SemiColon Colon EOF )) (define-tokens value-tokens (NAME NUM)) (define-lex-abbrevs (lower-letter (:/ #\a #\z)) (upper-letter (:/ #\A #\Z)) (alpha (:or lower-letter upper-letter)) (digit (:/ #\0 #\9)) (digits (:+ digit)) (letter (:or alpha digit "." "'" "-" "/")) ;; This requires that names have an underscore in them: ;; That's what distinguishes them from numbers: ;(name (:seq (:+ letter) (:+ (:seq "_" (:* letter))))) ;; [2010.08.13] I can't remember why I did that... were there gene names that started with numbers?? (name (:seq alpha (:+ (:or letter "_")))) ;; [2009.12.06] Odd, getting negative branch lengths!! ;; [2009.12.07] Ran into some trees with 0 distance and no decimal point: (number (:seq (:? "-") digits (:? (:seq "." digits (:? (:seq "E-" digits)))))) ;(number (:seq digit digits)) ) (define lex (lexer-src-pos [(eof) 'EOF] ;; Ignore all whitespace: [(:or #\tab #\space #\newline #\return) (return-without-pos (lex input-port))] ["(" 'LeftParen] [")" 'RightParen] ["," 'Comma] [";" 'SemiColon] [":" 'Colon] ;; Like most of these scripts this is a hack. It assumes all node ID's in the tree start with a particular string. ;[(:seq "c" name) (token-NAME lexeme)] [name (token-NAME lexeme)] [number (token-NUM (string->number lexeme))] ; [number (token-NUM lexeme)] )) (define (format-pos pos) (if (position-line pos) (format "line ~a:~a" (position-line pos) (position-col pos)) (format "char ~a" (position-offset pos)))) (define parse (parser (src-pos) (start wholefile) (end EOF) (tokens value-tokens op-tokens) (error (lambda (a b c start end) (fprintf (current-error-port) "PARSE ERROR: after ~a token, ~a~a.\n" (if a "valid" "invalid") b (if c (format " carrying value ~s" c) "")) (fprintf (current-error-port) " Located between ~a and ~a.\n" (format-pos start) (format-pos end)))) ;; Precedence: ;(precs ) ;(debug "_parser.log") ; Tree: The full input Newick Format for a single tree ; Subtree: an internal node (and its descendants) or a leaf node ; Leaf: a leaf node ; Internal: an internal node (and its descendants) ; BranchSet: a set of one or more Branches ; Branch: a tree edge and its descendant subtree. ; Name: the name of a node ; Length: the length of a tree edge. ; [edit]The grammar rules (grammar (wholefile [(Subtree SemiColon) $1] [(Branch SemiColon) $1]) (Subtree [(Leaf) $1] [(Internal) $1]) (Leaf [(Name) $1]) (Internal [(LeftParen BranchSet RightParen Name) (if $4 (cons $4 $2) $2)]) (BranchSet [(Branch) (list $1)] ;[(BranchSet Comma Branch) (cons $1 $3)] [(Branch Comma BranchSet) (cons $1 $3)] ) (Branch [(Subtree Length) (list $2 $1)]) (Name [() #f] [(NAME) $1]) (Length [() 0] [(Colon NUM) $2]) ) #; (grammar (wholefile [(tree SemiColon) $1] ;; ACK making semicolon and final tag optional!! [(tree) $1] [(LeftParen nodes+ RightParen) (cons 0.0 $2)] [(LeftParen nodes+ RightParen SemiColon) (cons 0.0 $2)] ) (numtag [(Colon NUM) $2] ;; [2010.08.13] Making them optional everywhere: [() 0.0] ) (tree [(NAME numtag) (list $2 $1)] [(LeftParen nodes+ RightParen numtag) (cons $4 $2)]) (nodes+ [(tree) (list $1)] [(tree Comma nodes+) (cons $1 $3)]) ) )) ;; ================================================================================ (define (parse-file f) (let ([p (open-input-file f)]) (let ([res (parse-port p)]) ;(printf "PARSED: \n")(pretty-print res) (close-input-port p) res))) (define (parse-port ip) (port-count-lines! ip) (parse (lambda () (let ((lexed (lex ip))) ;(printf "LEXED: ~s\n" (position-token-token lexed)) (flatten lexed)))) ) (define (flatten pt) (let loop ((x pt)) (if (position-token? (position-token-token x)) (begin (error 'flatten "No nested position-tokens!") (loop (position-token-token x))) x))) (define allargs (vector->list (current-command-line-arguments))) (define pretty? #t) ;; End main (print-graph #t) (print-vector-length #f) ;;==================================================================================================== ;; Now normalize the trees. ;;==================================================================================================== ;; The basic idea here is to sort branches by some criteria. ;; If that criteria turns out to be insufficient... then we can add extra criteria. (define (normalize-tree tr) ;; This returns a branch-sorted tree (define (loop tr ) (match tr [(,dist ,name) (guard (string? name)) (values (list dist name) 1 1)] [(,dist ,x) (error 'normalize-tree "was not expecting interior nodes with one sub-branch... ~s" (list dist x))] [(,dist ,[nodes sizes depths] ...) ;; This creates an ugly data type (list overuse) with both the branches and their size/depth measurement: (define sorted (sort (map list nodes sizes depths) (lambda (ls1 ls2) ;(printf "Comparing ~s and ~s\n" (cdr ls1) (cdr ls2)) (or (< (cadr ls1) (cadr ls2)) (and (= (cadr ls1) (cadr ls2)) (< (caddr ls1) (caddr ls2))))) )) ;(printf "\n ~a Node sizes ~s and depths ~s\n" dist sizes depths) ;; If there are any non-leaf "plateaus" in the ramp then we can't deal with that yet: (let loop ((last (car sorted)) (rest (cdr sorted))) (cond [(null? rest) (void)] ;; If they're equal according to the size/depth metric: [(and (equal? (cdr last) (cdar rest)) ;; And non-leaf: (not (equal? (cdr last) '(1 1)))) ;; Ok, given that we have a match, that's still ok if they are the SAME tree. ;; The only problem is if they're different in a way that our metric doesn't capture. (if (tree-shape-equals? (car last) (caar rest)) (void) (begin (printf "\nERRORful subbranches:\n") (pretty-print (car last)) (pretty-print (caar rest)) (error 'normalize-tree "Cannot yet handle same depth/size signatures of sub-branches: ~s ~s" (cdr last) (cdar rest))))] [else (loop (car rest) (cdr rest))])) (values (cons dist (map car sorted)) (apply + 1 sizes) (add1 (apply max depths)) )])) (define-values (tree _ __) (loop tr)) tree) ;; Assumes that the trees are already in normal form: ;; Compares tree shape and leaf nodes, IGNORES interior nodes. (define (tree-equals? tr1 tr2) ;(printf "EQUALS? ~s ~s\n" tr1 tr2) (match (cons tr1 tr2) [((,dist1 ,name1) . (,dist2 ,name2)) (guard (string? name1) (string? name2)) (if #t ;; -- This would give you only shape. (string=? name1 name2) ;; -- This requires that the leaf names be the same. ) ] [((,dist1 ,nodes1 ...) . (,dist2 ,nodes2 ...)) (and (equal? (length nodes1) (length nodes2)) (andmap tree-equals? nodes1 nodes2))] [(,s1 . ,s2) (guard (string? s1) (string? s2)) (error 'tree-equals? "implementation assumption violated... oops")] [,else #f])) (define (tree-shape-equals? tr1 tr2) ;(printf "\nSHAPE EQUALS? ~s ~s\n" tr1 tr2) (match (cons tr1 tr2) [((,dist1 ,name1) . (,dist2 ,name2)) (guard (string? name1) (string? name2)) #t ;; -- This would give you only shape. ] [((,dist1 ,nodes1 ...) . (,dist2 ,nodes2 ...)) (and (equal? (length nodes1) (length nodes2)) (andmap tree-shape-equals? nodes1 nodes2))] [(,s1 . ,s2) (guard (string? s1) (string? s2)) (error 'tree-shape-equals? "implementation assumption violated... oops")] [,else #f])) ;;==================================================================================================== ;; TESTS ;;==================================================================================================== (define tests '( "(,,(,));" ; no nodes are named "(A,B,(C,D));" ; leaf nodes are named "(A,B,(C,D)E)F;" ; all nodes are named "(:0.1,:0.2,(:0.3,:0.4):0.5);" ; all but root node have a distance to parent "(:0.1,:0.2,(:0.3,:0.4):0.5):0.0;" ; all have a distance to parent "(A:0.1,B:0.2,(C:0.3,D:0.4):0.5);" ; distances and leaf names (popular) "(A:0.1,B:0.2,(C:0.3,D:0.4)E:0.5)F;" ; distances and all names "((B:0.2,(C:0.3,D:0.4)E:0.5)F:0.1)A;" ; a tree rooted on a leaf node (rare) )) (define (test) (map parse tests)) ;;==================================================================================================== ;; Here's our script "main" ;;==================================================================================================== (define (new-main files) (define __ (begin (printf "\nBeginning processing of ~s files\n" (length files)))) ; (define parsed (parse-file file)) ; (define normalized (normalize-tree parsed)) ;(pretty-print parsed) ;(newline) ;(printf "NORMALIZED ~s\n " file) ;(pretty-print normalized) ;;(exit) (define ___ (begin (printf "Parsing and normalizing trees: ")(flush-output))) (define all-trees (map (lambda (file) ;(define __ (begin (printf " ~a " file)(flush-output))) (define parsed (parse-file file)) (define normalized (normalize-tree parsed)) (printf ".") (flush-output) (list file normalized)) files)) ;; Type: (((name tree) ...) ...) ;; Each equivalence class is represented by (tree name ...) ;; Only a single representative tree is needed. (define classes '()) (if (null? (cdr all-trees)) (begin ;; If we have one input just print it: (printf "\nOnly one file as input. Not making equivalence classes, just printing it:\n") (pretty-print (cadar all-trees)) ) (begin (printf "\nComputing equivalence classes on ~s trees: " (length all-trees)) (flush-output) (for-each (lambda (entry) (define found-hit #f) (match entry [(,file ,normtree) (let ([new (map (lambda (class) (if (tree-equals? normtree (car class)) (begin (set! found-hit #t) (cons (car class) (cons file (cdr class)))) class)) classes)]) (printf ".") (flush-output) (set! classes (if found-hit new (cons `(,normtree ,file) classes)))) ])) all-trees) (newline) ;; Now sort them: (set! classes (sort classes (lambda (a b) (< (length a) (length b))))) (printf "Finished. ~s files fell into ~s equivalence classes\n" (length files) (length classes)) (printf "Sizes of each equivalence class: ") (for-each (lambda (class) (printf " ~s" (length (cdr class)))) classes) (newline) (printf "Dumping lists of files to class.txt, and representative tree to class.tree...\n") (system "rm -f class*.txt class*.tree") (for-each (lambda (i class) ;(when (file-exists? (format "class~s.txt" i)) (delete-file (format "class~s.txt" i))) ;(when (file-exists? (format "class~s.tree" i)) (delete-file (format "class~s.tree" i))) (with-output-to-file (format "class~s.txt" i) (lambda () (for-each (lambda (file) (printf "~a " file)) (cdr class)) (newline))) (with-output-to-file (format "class~s.tree" i) (lambda () (pretty-print (car class)))) ) (srfi:iota (length classes)) classes) ))) (define (print-usage) (printf "Usage: find_tree_matches ...\n\n")) ;COUNTING: ((0 ((0 "lpg140") (0 "lpl135"))) (0 "lpp135") (0 "LPC_081")) (define (sumcount stuff) ;(printf "sumcount ~s\n" stuff) (if (or (string? stuff) (not stuff)) 0 (foldl (lambda (x acc) (+ acc (count x))) 0 stuff))) (define (count tree) ;(define __ (printf "COUNTING: ~s\n" tree)) (if (number? (car tree)) (+ 1 (sumcount (cadr tree))) (sumcount tree))) ;; BENCHMARK TEST: ;(for-each (lambda (file) (parse-file file)) allargs) (pretty-print (foldl + 0 (map (lambda (file) (count (parse-file file))) allargs))) ;(pretty-print (map (lambda (file) (pretty-print (count (parse-file file)))) allargs)) ;(pretty-print (count '((0 ((0 "lpg292") (0 "lpl285"))) (0 "lpp299") (0 "LPC_323")))) #; (begin (when (null? allargs) (print-usage) (flush-output) ;(error 'find_tree_matches "no files given") (printf "Error: no files given!\n") (exit 1)) (new-main allargs)) ) ;; End module. PhyBin-0.3/old_script/iu-exptime.ss000066400000000000000000000045711221636131000173460ustar00rootroot00000000000000;;; match.ss ;;; (broken out for PLT port only) ;;; This program was originally designed and implemented by Dan ;;; Friedman. It was redesigned and implemented by Erik Hilsdale; ;;; some improvements were suggested by Steve Ganz. Additional ;;; modifications were made by Kent Dybvig. Original port to PLT Scheme ;;; by Matthew Flatt. As of 20020328, Matt Jadud is maintaining ;;; the PLT Scheme port of this code. ;;; Copyright (c) 1992-2002 Cadence Research Systems ;;; Permission to copy this software, in whole or in part, to use this ;;; software for any lawful purpose, and to redistribute this software ;;; is granted subject to the restriction that all copies made of this ;;; software must include this copyright notice in full. This software ;;; is provided AS IS, with NO WARRANTY, EITHER EXPRESS OR IMPLIED, ;;; INCLUDING BUT NOT LIMITED TO IMPLIED WARRANTIES OF MERCHANTABILITY ;;; OR FITNESS FOR ANY PARTICULAR PURPOSE. IN NO EVENT SHALL THE ;;; AUTHORS BE LIABLE FOR CONSEQUENTIAL OR INCIDENTAL DAMAGES OF ANY ;;; NATURE WHATSOEVER. ;;; If you are reading this comment, then you do not have ;;; the most recent version of 'match'. (module iu-exptime mzscheme (provide syntax-error let-synvalues* with-values syntax-lambda with-temp with-temps) (define (syntax-error e m) (raise-syntax-error #f m e)) (define-syntax let-synvalues* (syntax-rules () ((_ () B0 B ...) (begin B0 B ...)) ((_ (((Formal ...) Exp) Decl ...) B0 B ...) (call-with-values (lambda () Exp) (lambda (Formal ...) (with-syntax ((Formal Formal) ...) (let-synvalues* (Decl ...) B0 B ...))))))) (define-syntax with-values (syntax-rules () ((_ P C) (call-with-values (lambda () P) C)))) (define-syntax syntax-lambda (lambda (x) (syntax-case x () ((_ (Pat ...) Body0 Body ...) (with-syntax (((X ...) (generate-temporaries #'(Pat ...)))) #'(lambda (X ...) (with-syntax ((Pat X) ...) Body0 Body ...))))))) (define-syntax with-temp (syntax-rules () ((_ V Body0 Body ...) (with-syntax (((V) (generate-temporaries '(x)))) Body0 Body ...)))) (define-syntax with-temps (syntax-rules () ((_ (V ...) (Exp ...) Body0 Body ...) (with-syntax (((V ...) (generate-temporaries #'(Exp ...)))) Body0 Body ...)))) ) PhyBin-0.3/old_script/iu-match.ss000066400000000000000000000543101221636131000167630ustar00rootroot00000000000000;;; match.ss ;;; This program was originally designed and implemented by Dan ;;; Friedman. It was redesigned and implemented by Erik Hilsdale; ;;; some improvements were suggested by Steve Ganz. Additional ;;; modifications were made by Kent Dybvig. Original port to PLT Scheme ;;; by Matthew Flatt. As of 20020328, Matt Jadud is maintaining ;;; the PLT Scheme port of this code. ;;; Copyright (c) 1992-2002 Cadence Research Systems ;;; Permission to copy this software, in whole or in part, to use this ;;; software for any lawful purpose, and to redistribute this software ;;; is granted subject to the restriction that all copies made of this ;;; software must include this copyright notice in full. This software ;;; is provided AS IS, with NO WARRANTY, EITHER EXPRESS OR IMPLIED, ;;; INCLUDING BUT NOT LIMITED TO IMPLIED WARRANTIES OF MERCHANTABILITY ;;; OR FITNESS FOR ANY PARTICULAR PURPOSE. IN NO EVENT SHALL THE ;;; AUTHORS BE LIABLE FOR CONSEQUENTIAL OR INCIDENTAL DAMAGES OF ANY ;;; NATURE WHATSOEVER. ;;; A complete CHANGELOG and some documentation can be found at the ;;; end of this file. ;;; If you are reading this comment, then you do not have ;;; the most recent version of 'match'. ;; ============================================================= ;; Exp ::= (match Exp Clause) ;; || (trace-match Exp Clause) ;; || (match+ (Id*) Exp Clause*) ;; || (trace-match+ (Id*) Exp Clause*) ;; || OtherSchemeExp ;; Clause ::= (Pat Exp+) || (Pat (guard Exp*) Exp+) ;; Pat ::= (Pat ... . Pat) ;; || (Pat . Pat) ;; || () ;; || #(Pat* Pat ... Pat*) ;; || #(Pat*) ;; || ,Id ;; || ,[Id*] ;; || ,[Cata -> Id*] ;; || Id ;; Cata ::= Exp ;; YOU'RE NOT ALLOWED TO REFER TO CATA VARS IN GUARDS. (reasonable!) #cs (module iu-match mzscheme (provide match+ trace-match+ match trace-match ) (require-for-syntax "iu-exptime.ss" (lib "stx.ss" "syntax")) (require (lib "list.ss") (lib "etc.ss") ) ;; RRN: [2005.11.13] (require "cheztrace.ss") (define (make-list n v) (vector->list (make-vector n v))) (define match-equality-test (make-parameter equal? (lambda (x) (unless (procedure? x) (error 'match-equality-test "~s is not a procedure" x)) x))) (define-syntax match+ (lambda (x) (syntax-case x () [(_ (ThreadedId ...) Exp Clause ...) #'(let f ((ThreadedId ThreadedId) ... (x Exp)) (match-help _ f x (ThreadedId ...) Clause ...))]))) (define-syntax match (lambda (x) (syntax-case x () [(_ Exp Clause ...) #'(let f ((x Exp)) (match-help _ f x () Clause ...))]))) (define-syntax trace-match+ (lambda (x) (syntax-case x () [(_ (ThreadedId ...) Name Exp Clause ...) #'(letrec ((f (trace-lambda Name (ThreadedId ... x) (match-help _ f x (ThreadedId ...) Clause ...)))) (f ThreadedId ... x))]))) (define-syntax trace-match (lambda (x) (syntax-case x () [(_ Name Exp Clause ...) #'(letrec ((f (trace-lambda Name (x) (match-help _ f x () Clause ...)))) (f Exp))]))) ;;; ------------------------------ (define-syntax let-values** (syntax-rules () ((_ () B0 B ...) (begin B0 B ...)) ((_ ((Formals Exp) Rest ...) B0 B ...) (let-values** (Rest ...) (call-with-values (lambda () Exp) (lambda Formals B0 B ...)))))) (define-syntax match-help (lambda (x) (syntax-case x () ((_ Template Cata Obj ThreadedIds) #'(error 'match "Unmatched datum: ~s" Obj)) ((_ Template Cata Obj ThreadedIds (Pat B0 B ...) Rest ...) #'(convert-pat Pat (match-help1 Template Cata Obj ThreadedIds (B0 B ...) Rest ...)))))) (define-syntax match-help1 (syntax-rules (guard) ((_ PatLit Vars Cdecls Template Cata Obj ThreadedIds ((guard G ...) B0 B ...) Rest ...) (let ((ls/false (sexp-dispatch Obj PatLit))) (if (and ls/false (apply (lambda Vars (guard-body Cdecls (extend-backquote Template (and G ...)))) ls/false)) (apply (lambda Vars (clause-body Cata Cdecls ThreadedIds (extend-backquote Template B0 B ...))) ls/false) (match-help Template Cata Obj ThreadedIds Rest ...)))) ((_ PatLit Vars Cdecls Template Cata Obj ThreadedIds (B0 B ...) Rest ...) (let ((ls/false (sexp-dispatch Obj PatLit))) (if ls/false (apply (lambda Vars (clause-body Cata Cdecls ThreadedIds (extend-backquote Template B0 B ...))) ls/false) (match-help Template Cata Obj ThreadedIds Rest ...)))))) (define-syntax clause-body (lambda (x) (define build-mapper (lambda (vars depth cata tIds) (if (zero? depth) cata (with-syntax ((rest (build-mapper vars (- depth 1) cata tIds)) (vars vars) (tIds tIds)) #'(mapper rest vars tIds))))) (syntax-case x () ((_ Cata ((CVar CDepth CMyCata CFormal ...) ...) (ThreadedId ...) B) (with-syntax (((Mapper ...) (map (lambda (mycata formals depth) (build-mapper formals (syntax-object->datum depth) (syntax-case mycata () [#f #'Cata] [exp #'exp]) #'(ThreadedId ...))) (syntax->list #'(CMyCata ...)) (syntax->list #'((CFormal ...) ...)) (syntax->list #'(CDepth ...)))) (((ThreadedId ...) ...) (map (lambda (x) #'(ThreadedId ...)) (syntax->list #'((CFormal ...) ...))))) #'(let-values** (([ThreadedId ... CFormal ...] (Mapper ThreadedId ... CVar)) ...) B)))))) (define-syntax guard-body (lambda (x) (syntax-case x () ((_ ((Cvar Cdepth MyCata Cformal ...) ...) B) (with-syntax (((CF ...) (apply append (map syntax->list (syntax->list #'((Cformal ...) ...)))))) #'(let-syntax ((CF (lambda (x) (syntax-case x () (Name (syntax-error #'Name "guard cannot refer to return-value variable"))))) ...) B)))))) (define-syntax convert-pat ;; returns sexp-pat x vars x cdecls (let () (define ellipsis? (lambda (x) (and (identifier? x) (eq? (syntax-e x) '...)))) (define Var? (lambda (x) (syntax-case x (->) [-> #f] [id (identifier? #'id)]))) (define (f syn vars cdecls depth) (syntax-case syn (unquote) ((unquote . stuff) ; separate for better error detection (syntax-case syn (unquote ->) ((unquote [MyCata -> Var ...]) (andmap Var? (syntax->list #'(Var ...))) (with-syntax (((Temp) (generate-temporaries '(x))) (Depth depth)) (values #'any (cons #'Temp vars) (cons #'[Temp Depth MyCata Var ...] cdecls)))) ((unquote [Var ...]) (andmap Var? (syntax->list #'(Var ...))) (with-syntax (((Temp) (generate-temporaries '(x))) (Depth depth)) (values #'any (cons #'Temp vars) (cons #'[Temp Depth #f Var ...] cdecls)))) ((unquote Var) (Var? #'Var) (values #'any (cons #'Var vars) cdecls)))) (((unquote . stuff) Dots) ; separate for better error detection (ellipsis? #'Dots) (syntax-case syn (unquote ->) (((unquote [MyCata -> Var ...]) Dots) (andmap Var? (syntax->list #'(Var ...))) (with-syntax (((Temp) (generate-temporaries '(x))) (Depth+1 (add1 depth))) (values #'each-any (cons #'Temp vars) (cons #'[Temp Depth+1 MyCata Var ...] cdecls)))) (((unquote [Var ...]) Dots) (andmap Var? (syntax->list #'(Var ...))) (with-syntax (((Temp) (generate-temporaries '(x))) (Depth+1 (add1 depth))) (values #'each-any (cons #'Temp vars) (cons #'[Temp Depth+1 #f Var ...] cdecls)))) (((unquote Var) Dots) (Var? #'Var) (values #'each-any (cons #'Var vars) cdecls)) ((expr Dots) (syntax-error #'expr "match-pattern unquote syntax")))) ((Pat Dots) (ellipsis? #'Dots) (let-synvalues* (((Dpat Dvars Dcdecls) (f #'Pat vars cdecls (add1 depth)))) ; (printf "GOT VARS: ~s\n" vars) (with-syntax ((Size (- (length (syntax->list #'Dvars)) ;; RRN: this is screwy: (length (if (syntax? vars) (syntax->list vars) vars)) ))) ;;MCJ wrapped syntax->list around vars (values #'#(each Dpat Size) #'Dvars #'Dcdecls)))) ((Pat Dots . Rest) (ellipsis? #'Dots) (let-synvalues* (((Rpat Rvars Rcdecls) (f #'Rest vars cdecls depth)) ((Dpat Dvars Dcdecls) (f #'(Pat (... ...)) #'Rvars #'Rcdecls depth))) (with-syntax ((Size (- (length (syntax->list #'Dvars)) (length (syntax->list #'Rvars)))) ((RevRestTl . RevRest) (reverseX (syntax->list #'Rpat) '()))) (values #'#(tail-each Dpat Size RevRest RevRestTl) #'Dvars #'Dcdecls)))) ((X . Y) (let-synvalues* (((Ypat Yvars Ycdecls) (f #'Y vars cdecls depth)) ((Xpat Xvars Xcdecls) (f #'X #'Yvars #'Ycdecls depth))) (values #'(Xpat . Ypat) #'Xvars #'Xcdecls))) (() (values #'() vars cdecls)) (#(X ...) (let-synvalues* (((Pat Vars CDecls) (f #'(X ...) vars cdecls depth))) (values #'#(vector Pat) #'Vars #'CDecls))) (Thing (values #'#(atom Thing) vars cdecls)))) (define reverseX (lambda (ls acc) (if (pair? ls) (reverseX (cdr ls) (cons (car ls) acc)) (cons ls acc)))) (lambda (syn) (syntax-case syn () ((_ syn (kh . kt)) (let-synvalues* (((a b c) (f #'syn '() '() 0))) #'(kh 'a b c . kt))))))) (define-syntax mapper (lambda (x) (syntax-case x () ((_ F (RetId ...) (ThreadId ...)) (with-syntax (((t ...) (generate-temporaries #'(RetId ...))) ((ts ...) (generate-temporaries #'(RetId ...))) ((null ...) (map (lambda (x) #'()) (syntax->list #'(RetId ...))))) #'(let ((fun F)) (rec g (lambda (ThreadId ... ls) (if (null? ls) (values ThreadId ... null ...) (call-with-values (lambda () (g ThreadId ... (cdr ls))) (lambda (ThreadId ... ts ...) (call-with-values (lambda () (fun ThreadId ... (car ls))) (lambda (ThreadId ... t ...) (values ThreadId ... (cons t ts) ...)))))))))))))) ;;; ------------------------------ (define-syntax my-backquote (lambda (x) (define ellipsis? (lambda (x) (and (identifier? x) (eq? (syntax-e x) '...)))) (define destruct (lambda (Orig x depth) (syntax-case x (quasiquote unquote unquote-splicing) ;; inner quasiquote ((quasiquote Exp) (with-values (destruct Orig #'Exp (add1 depth)) (syntax-lambda (Builder Vars Exps) (if (null? #'Vars) (values #''(quasiquote Exp) '() '()) (values #'(list 'quasiquote Builder) #'Vars #'Exps))))) ;; unquote ((unquote Exp) (zero? depth) (with-temp X (values #'X (list #'X) (list #'Exp)))) ((unquote Exp) (with-values (destruct Orig #'Exp (sub1 depth)) (syntax-lambda (Builder Vars Exps) (if (null? #'Vars) (values #''(unquote Exp) '() '()) (values #'(list 'unquote Builder) #'Vars #'Exps))))) ;; splicing (((unquote-splicing Exp)) (zero? depth) (with-temp X (values #'X (list #'X) (list #'Exp)))) (((unquote-splicing Exp ...)) (zero? depth) (with-temps (X ...) (Exp ...) (values #'(append X ...) #'(X ...) #'(Exp ...)))) (((unquote-splicing Exp ...) . Rest) (zero? depth) (with-values (destruct Orig #'Rest depth) (syntax-lambda (Builder Vars Exps) (with-temps (X ...) (Exp ...) (if (null? #'Vars) (values #'(append X ... 'Rest) #'(X ...) #'(Exp ...)) (values #'(append X ... Builder) #'(X ... . Vars) #'(Exp ... . Exps))))))) ((unquote-splicing Exp ...) (with-values (destruct Orig #'(Exp ...) (sub1 depth)) (syntax-lambda (Builder Vars Exps) (if (null? #'Vars) (values #''(unquote-splicing Exp ...) '() '()) (values #'(cons 'unquote-splicing Builder) #'Vars #'Exps))))) ;; dots (((unquote Exp) Dots) (and (zero? depth) (ellipsis? #'Dots)) (with-temp X (values #'X (list #'X) (list #'Exp)))) (((unquote Exp) Dots . Rest) (and (zero? depth) (ellipsis? #'Dots)) (with-values (destruct Orig #'Rest depth) (syntax-lambda (RestBuilder RestVars RestExps) (with-syntax ((TailExp (if (stx-null? #'RestVars) #''Rest #'RestBuilder))) (with-temp X (values #'(append X TailExp) (cons #'X #'RestVars) (cons #'Exp #'RestExps))))))) ((Exp Dots . Rest) (and (zero? depth) (ellipsis? #'Dots)) (with-values (destruct Orig #'Exp depth) (syntax-lambda (ExpBuilder (ExpVar ...) (ExpExp ...)) (if (null? #'(ExpVar ...)) (syntax-error Orig "Bad ellipsis") (with-values (destruct Orig #'Rest depth) (syntax-lambda (RestBuilder RestVars RestExps) (with-syntax ((TailExp (if (null? #'RestVars) #''Rest #'RestBuilder)) (Orig Orig)) (values #'(let f ((ExpVar ExpVar) ...) (if (and (pair? ExpVar) ...) (cons (let ((ExpVar (car ExpVar)) ...) ExpBuilder) (f (cdr ExpVar) ...)) (if (and (null? ExpVar) ...) TailExp (error 'unquote "Mismatched lists in ~s" Orig)))) (append (syntax->list #'(ExpVar ...)) (syntax->list #'RestVars)) (append (syntax->list #'(ExpExp ...)) (syntax->list #'RestExps)))))))))) ;; Vectors (#(X ...) (with-values (destruct Orig #'(X ...) depth) (syntax-lambda (LsBuilder LsVars LsExps) (values #'(list->vector LsBuilder) #'LsVars #'LsExps)))) ;; random stuff ((Hd . Tl) (with-values (destruct Orig #'Hd depth) (syntax-lambda (HdBuilder HdVars HdExps) (with-values (destruct Orig #'Tl depth) (syntax-lambda (TlBuilder TlVars TlExps) (with-syntax ((Hd (if (null? #'HdVars) #''Hd #'HdBuilder)) (Tl (if (null? #'TlVars) #''Tl #'TlBuilder))) (values #'(cons Hd Tl) (append (syntax->list #'HdVars) (syntax->list #'TlVars)) (append (syntax->list #'HdExps) (syntax->list #'TlExps))))))))) (OtherThing (values #''OtherThing '() '()))))) ;; macro begins (syntax-case x () ((_ Datum) (with-values (destruct #'(quasiquote Datum) #'Datum 0) (syntax-lambda (Builder (Var ...) (Exp ...)) (if (null? #'(Var ...)) #''Datum #'(let ((Var Exp) ...) Builder)))))))) (define-syntax extend-backquote (lambda (x) (syntax-case x () ((_ Template Exp ...) (with-syntax ((quasiquote (datum->syntax-object #'Template 'quasiquote))) #'(let-syntax ((quasiquote (lambda (x) (syntax-case x () ((_ Foo) #'(my-backquote Foo)))))) Exp ...)))))) ;;; ------------------------------ (define-syntax with-values (syntax-rules () ((_ P C) (call-with-values (lambda () P) C)))) (define-syntax letcc (syntax-rules () ((_ V B0 B ...) (call/ec (lambda (V) B0 B ...))))) (define classify-list (lambda (ls) (cond ((null? ls) 'proper) ((not (pair? ls)) 'improper) (else (let f ((tortoise ls) (hare (cdr ls))) (cond ((eq? tortoise hare) 'infinite) ((null? hare) 'proper) ((not (pair? hare)) 'improper) (else (let ((hare (cdr hare))) (cond ((null? hare) 'proper) ((not (pair? hare)) 'improper) (else (f (cdr ls) (cdr hare)))))))))))) (define ilist-copy-flat (lambda (ils) (let f ((tortoise ils) (hare (cdr ils))) (if (eq? tortoise hare) (list (car tortoise)) (cons (car tortoise) (f (cdr tortoise) (cddr hare))))))) (define sexp-dispatch (lambda (obj pat);; #f or list of vars (letcc escape (let ((fail (lambda () (escape #f)))) (let f ((pat pat) (obj obj) (vals '())) (cond ((eq? pat 'any) (cons obj vals)) ((eq? pat 'each-any) ;; handle infinities (case (classify-list obj) ((proper infinite) (cons obj vals)) ((improper) (fail)))) ((pair? pat) (if (pair? obj) (f (car pat) (car obj) (f (cdr pat) (cdr obj) vals)) (fail))) ((vector? pat) (case (vector-ref pat 0) ((atom) (let ((a (vector-ref pat 1))) (if (eqv? obj a) vals (fail)))) ((vector) (if (vector? obj) (let ((vec-pat (vector-ref pat 1))) (f vec-pat (vector->list obj) vals)) (fail))) ((each) ;; if infinite, copy the list as flat, then do the matching, ;; then do some set-cdrs. (let ((each-pat (vector-ref pat 1)) (each-size (vector-ref pat 2))) (case (classify-list obj) ((improper) (fail)) ((infinite) (error 'iu-match "not handling infinite lists at the moment...") #; (let ((each-vals (f pat (ilist-copy-flat obj) '()))) (for-each (lambda (x) (set-cdr! (last-pair x) x)) each-vals) (append each-vals vals))) ((proper) (append (let g ((obj obj)) (if (null? obj) (make-list each-size '()) (let ((hd-vals (f each-pat (car obj) '())) (tl-vals (g (cdr obj)))) (map cons hd-vals tl-vals)))) vals))))) ((tail-each) (let ((each-pat (vector-ref pat 1)) (each-size (vector-ref pat 2)) (revtail-pat (vector-ref pat 3)) (revtail-tail-pat (vector-ref pat 4))) (when (eq? (classify-list obj) 'infinite) (fail)) (with-values (let g ((obj obj)) ;; in-tail?, vals, revtail-left/ls (cond ((pair? obj) (with-values (g (cdr obj)) (lambda (in-tail? vals tail-left/ls) (if in-tail? (if (null? tail-left/ls) (values #f vals (list (car obj))) (values #t (f (car tail-left/ls) (car obj) vals) (cdr tail-left/ls))) (values #f vals (cons (car obj) tail-left/ls)))))) (else (values #t (f revtail-tail-pat obj '()) revtail-pat)))) (lambda (in-tail? vals tail-left/ls) (if in-tail? (if (null? tail-left/ls) (append (make-list each-size '()) vals) (fail)) (f each-pat tail-left/ls vals)))))))) (else (if (eqv? obj pat) vals (fail))))))))) ) ;;;CHANGELOG ;; [04 Nov 2005] ;; rrn fixed a bug in the code for handling ellipses. ;; (was (length (syntax->list vars)) rather than (length vars)) ;; [13 March 2002] ;; rkd added following change by Friedman and Ganz to the main source ;; code thread and fixed a couple of minor problems. ;; [9 March 2002] ;; Dan Friedman and Steve Ganz added the ability to use identical pattern ;; variables. The patterns represented by the variables are compared ;; using the value of the parameter match-equality-test, which defaults ;; to equal?. ;; ;; > (match '(1 2 1 2 1) ;; [(,a ,b ,a ,b ,a) (guard (number? a) (number? b)) (+ a b)]) ;; 3 ;; ;; ;; > (match '((1 2 3) 5 (1 2 3)) ;; [((,a ...) ,b (,a ...)) `(,a ... ,b)]) ;; (1 2 3 5) ;; ;; ;; > (parameterize ([match-equality-test (lambda (x y) (equal? x (reverse y)))]) ;; (match '((1 2 3) (3 2 1)) ;; [(,a ,a) 'yes] ;; [,oops 'no])) ;; yes ;; [10 Jan 2002] ;; eh fixed bug that caused (match '((1 2 3 4)) (((,a ... ,d) . ,x) a)) to ;; blow up. The bug was caused by a bug in the sexp-dispatch procedure ;; where a base value empty list was passed to an accumulator from inside ;; the recursion, instead of passing the old value of the accumulator. ;; [14 Jan 2001] ;; rkd added syntax checks to unquote pattern parsing to weed out invalid ;; patterns like ,#(a) and ,[(vector-ref d 1)]. ;; [14 Jan 2001] ;; rkd added ,[Cata -> Id* ...] to allow specification of recursion ;; function. ,[Id* ...] recurs to match; ,[Cata -> Id* ...] recurs ;; to Cata. ;; [14 Jan 2001] ;; rkd tightened up checks for ellipses and nested quasiquote; was comparing ;; symbolic names, which, as had been noted in the source, is a possible ;; hygiene bug. Replaced error call in guard-body with syntax-error to ;; allow error to include source line/character information. ;; [13 Jan 2001] ;; rkd fixed match patterns of the form (stuff* ,[x] ... stuff+), which ;; had been recurring on subforms of each item rather than on the items ;; themselves. ;; [29 Feb 2000] ;; Fixed a case sensitivity bug. ;; [24 Feb 2000] ;; Matcher now handles vector patterns. Quasiquote also handles ;; vector patterns, but does NOT do the csv6.2 optimization of ;; `#(a 1 ,(+ 3 4) x y) ==> (vector 'a 1 (+ 3 4) 'x 'y). ;; Also fixed bug in (P ... . P) matching code. ;; [23 Feb 2000] ;; KSM fixed bug in unquote-splicing inside quasiquote. ;; [10 Feb 2000] ;; New forms match+ and trace-match+ thread arguments right-to-left. ;; The pattern (P ... . P) now works the way you might expect. ;; Infinite lists are now properly matched (and not matched). ;; Removed the @ pattern. ;; Internal: No longer converting into syntax-case. ;; [6 Feb 2000] ;; Added expansion-time error message for referring to cata variable ;; in a guard. ;; [4 Feb 2000] ;; Fixed backquote so it can handle nested backquote (oops). ;; Double-backquoted elipses are neutralized just as double-backquoted ;; unquotes are. So: ;; `(a ,'(1 2 3) ... b) =eval=> (a 1 2 3 b) ;; ``(a ,'(1 2 3) ... b) =eval=> `(a ,'(1 2 3) ... b) ;; ``(a ,(,(1 2 3) ...) b) =eval=> `(a ,(1 2 3) b) ;; Added support for ;; `((unquote-splicing x y z) b) =expand==> (append x y z (list 'b)) ;; [1 Feb 2000] ;; Fixed a bug involving forgetting to quote stuff in the revised backquote. ;; Recognized unquote-splicing and signalled errors in the appropriate places. ;; Added support for deep elipses in backquote. ;; Rewrote backquote so it does the rebuilding directly instead of ;; expanding into Chez's backquote. ;; [31 Jan 2000] ;; Kent Dybvig fixed template bug. ;; [31 Jan 2000] ;; Added the trace-match form, and made guards contain ;; an explicit and expression: ;; (guard E ...) ==> (guard (and E ...)) ;; [26 Jan 2000] ;; Inside the clauses of match expressions, the following ;; transformation is performed inside backquote expressions: ;; ,v ... ==> ,@v ;; (,v ,w) ... ==> ,@(map list v w) ;; etc. ;(require iu-match) ;; RRN: DEBUGGING ;(define (f) ; (match '((1 2) (3 4)) ; [((,x ,y) ...) y])) ;(f)PhyBin-0.3/old_script/match.ss000066400000000000000000000707231221636131000163560ustar00rootroot00000000000000;;; match.ss ;;; Time-stamp: Sun Jan 14 09:04:21 EST 2001 rkd ;; [14 Jan 2001] ;; rkd added syntax checks to unquote pattern parsing to weed out invalid ;; patterns like ,#(a) and ,[(vector-ref d 1)]. ;; [14 Jan 2001] ;; rkd added ,[Cata -> Id* ...] to allow specification of recursion ;; function. ,[Id* ...] recurs to match; ,[Cata -> Id* ...] recurs ;; to Cata. ;; [14 Jan 2001] ;; rkd tightened up checks for ellipses and nested quasiquote; was comparing ;; symbolic names, which, as had been noted in the source, is a possible ;; hygiene bug. Replaced error call in guard-body with syntax-error to ;; allow error to include source line/character information. ;; [13 Jan 2001] ;; rkd fixed match patterns of the form (stuff* ,[x] ... stuff+), which ;; had been recurring on subforms of each item rather than on the items ;; themselves. ;; Previous changelog listings at end of file. ;; ============================================================= ;; Exp ::= (match Exp Clause) ;; || (trace-match Exp Clause) ;; || (match+ (Id*) Exp Clause*) ;; || (trace-match+ (Id*) Exp Clause*) ;; || OtherSchemeExp ;; Clause ::= (Pat Exp+) || (Pat (guard Exp*) Exp+) ;; Pat ::= (Pat ... . Pat) ;; || (Pat . Pat) ;; || () ;; || #(Pat* Pat ... Pat*) ;; || #(Pat*) ;; || ,Id ;; || ,[Id*] ;; || ,[Cata -> Id*] ;; || Id ;; Cata ::= Exp ;; YOU'RE NOT ALLOWED TO REFER TO CATA VARS IN GUARDS. (reasonable!) (module iu-match ((match+ match-help match-help1 clause-body let-values** guard-body convert-pat mapper my-backquote extend-backquote sexp-dispatch) (trace-match+ match-help match-help1 clause-body let-values** guard-body convert-pat mapper my-backquote extend-backquote sexp-dispatch) (match match-help match-help1 clause-body let-values** guard-body convert-pat mapper my-backquote extend-backquote sexp-dispatch) (trace-match match-help match-help1 clause-body let-values** guard-body convert-pat mapper my-backquote extend-backquote sexp-dispatch) ;; TEMP clause-body extend-backquote ) ;(import scheme) (define-syntax match+ (lambda (x) (syntax-case x () [(_ (ThreadedId ...) Exp Clause ...) #'(let f ((ThreadedId ThreadedId) ... (x Exp)) (match-help _ f x (ThreadedId ...) Clause ...))]))) (define-syntax match (lambda (x) (syntax-case x () [(_ Exp Clause ...) #'(let f ((x Exp)) (match-help _ f x () Clause ...))]))) (define-syntax trace-match+ (lambda (x) (syntax-case x () [(_ (ThreadedId ...) Name Exp Clause ...) #'(letrec ((f (trace-lambda Name (ThreadedId ... x) (match-help _ f x (ThreadedId ...) Clause ...)))) (f ThreadedId ... x))]))) (define-syntax trace-match (lambda (x) (syntax-case x () [(_ Name Exp Clause ...) #'(letrec ((f (trace-lambda Name (x) (match-help _ f x () Clause ...)))) (f Exp))]))) ;;; ------------------------------ (define-syntax let-values** (syntax-rules () ((_ () B0 B ...) (begin B0 B ...)) ((_ ((Formals Exp) Rest ...) B0 B ...) (let-values** (Rest ...) (call-with-values (lambda () Exp) (lambda Formals B0 B ...)))))) (define-syntax match-help (lambda (x) (syntax-case x () ((_ Template Cata Obj ThreadedIds) ;(inspect `(HRM ,(datum Template) ,(datum Cata) ,(datum Obj) ,(datum ThreadedIds))) ;(inspect #'Template) #'(error 'match "Unmatched datum.\n Datum: ~s\n Source-Location: ~s\n" Obj #'Template)) ((_ Template Cata Obj ThreadedIds (Pat B0 B ...) Rest ...) #'(convert-pat Pat (match-help1 Template Cata Obj ThreadedIds (B0 B ...) Rest ...))) ))) (define-syntax match-help1 (syntax-rules (guard) ((_ PatLit Vars Cdecls Template Cata Obj ThreadedIds ((guard G ...) B0 B ...) Rest ...) (let ((ls/false (sexp-dispatch Obj PatLit))) (if (and ls/false (apply (lambda Vars (guard-body Cdecls (extend-backquote Template (and G ...)))) ls/false)) (apply (lambda Vars (clause-body Cata Cdecls ThreadedIds (extend-backquote Template B0 B ...))) ls/false) (match-help Template Cata Obj ThreadedIds Rest ...)))) ((_ PatLit Vars Cdecls Template Cata Obj ThreadedIds (B0 B ...) Rest ...) (let ((ls/false (sexp-dispatch Obj PatLit))) (if ls/false (apply (lambda Vars (clause-body Cata Cdecls ThreadedIds (extend-backquote Template B0 B ...))) ls/false) (match-help Template Cata Obj ThreadedIds Rest ...)))))) ;((clause-body f () () (extend-backquote _ (void)))) (define-syntax clause-body (lambda (x) (define build-mapper (lambda (vars depth cata tIds) (if (zero? depth) cata (with-syntax ((rest (build-mapper vars (- depth 1) cata tIds)) (vars vars) (tIds tIds)) #'(mapper rest vars tIds))))) (syntax-case x () ((_ Cata ((CVar CDepth CMyCata CFormal ...) ...) (ThreadedId ...) B) ;;; RRN: Hacking this a bit for larceny compat: (with-syntax ((((TIDs ...) ...) (datum->syntax-object #'_ (map (lambda _ #'(ThreadedId ...)) (syntax-object->datum #'((CFormal ...) ...)))))) (with-syntax (((Mapper ...) (map (lambda (mycata formals depth) (build-mapper formals (syntax-object->datum depth) (syntax-case mycata () [#f #'Cata] [exp #'exp]) #'(ThreadedId ...))) #'(CMyCata ...) #'((CFormal ...) ...) #'(CDepth ...)))) ;; [2007.12.26] RRN: Removing ellipses from references to ThreadedId here: ;; This appears to have been a bug. Introducing TIDs instead. #'(let-values** (([TIDs ... CFormal ...] (Mapper TIDs ... CVar)) ...) B) ) ))))) (define-syntax guard-body (lambda (x) (syntax-case x () ((_ ((Cvar Cdepth MyCata Cformal ...) ...) B) (with-syntax (((CF ...) (apply append #'((Cformal ...) ...)))) #'(let-syntax ((CF (lambda (x) (syntax-case x () (Name (syntax-error #'Name "guard cannot refer to return-value variable"))))) ...) B)))))) (define-syntax convert-pat ;; returns sexp-pat x vars x cdecls (let () (define ellipsis? (lambda (x) (and (identifier? x) (literal-identifier=? x #'(... ...))))) (define Var? (lambda (x) (syntax-case x (->) [-> #f] [id (identifier? #'id)]))) (define (f syn vars cdecls depth) (syntax-case syn (unquote) ((unquote . stuff) ; separate for better error detection (syntax-case syn (unquote ->) ((unquote [MyCata -> Var ...]) (andmap Var? #'(Var ...)) (with-syntax (((Temp) (generate-temporaries '(x))) (Depth depth)) (values #'any (cons #'Temp vars) (cons #'[Temp Depth MyCata Var ...] cdecls)))) ((unquote [Var ...]) (andmap Var? #'(Var ...)) (with-syntax (((Temp) (generate-temporaries '(x))) (Depth depth)) (values #'any (cons #'Temp vars) (cons #'[Temp Depth #f Var ...] cdecls)))) ((unquote Var) (Var? #'Var) (values #'any (cons #'Var vars) cdecls)))) (((unquote . stuff) Dots) ; separate for better error detection (ellipsis? #'Dots) (syntax-case syn (unquote ->) (((unquote [MyCata -> Var ...]) Dots) (andmap Var? #'(Var ...)) (with-syntax (((Temp) (generate-temporaries '(x))) (Depth+1 (+ 1 depth))) (values #'each-any (cons #'Temp vars) (cons #'[Temp Depth+1 MyCata Var ...] cdecls)))) (((unquote [Var ...]) Dots) (andmap Var? #'(Var ...)) (with-syntax (((Temp) (generate-temporaries '(x))) (Depth+1 (+ 1 depth))) (values #'each-any (cons #'Temp vars) (cons #'[Temp Depth+1 #f Var ...] cdecls)))) (((unquote Var) Dots) (Var? #'Var) (values #'each-any (cons #'Var vars) cdecls)) ((expr Dots) (syntax-error #'expr "match-pattern unquote syntax")))) ((Pat Dots) (ellipsis? #'Dots) (let-synvalues* (((Dpat Dvars Dcdecls) (f #'Pat vars cdecls (+ 1 depth)))) (with-syntax ((Size (- (length #'Dvars) (length vars)))) (values #'#(each Dpat Size) #'Dvars #'Dcdecls)))) ((Pat Dots . Rest) (ellipsis? #'Dots) (let-synvalues* (((Rpat Rvars Rcdecls) (f #'Rest vars cdecls depth)) ((Dpat Dvars Dcdecls) (f #'(Pat (... ...)) #'Rvars #'Rcdecls depth))) (with-syntax ((Size (- (length #'Dvars) (length #'Rvars))) ((RevRestTl . RevRest) (reverseX #'Rpat '()))) (values #'#(tail-each Dpat Size RevRest RevRestTl) #'Dvars #'Dcdecls)))) ((X . Y) (let-synvalues* (((Ypat Yvars Ycdecls) (f #'Y vars cdecls depth)) ((Xpat Xvars Xcdecls) (f #'X #'Yvars #'Ycdecls depth))) (values #'(Xpat . Ypat) #'Xvars #'Xcdecls))) (() (values #'() vars cdecls)) (#(X ...) (let-synvalues* (((Pat Vars CDecls) (f #'(X ...) vars cdecls depth))) (values #'#(vector Pat) #'Vars #'CDecls))) (Thing (values #'#(atom Thing) vars cdecls)))) (define reverseX (lambda (ls acc) (if (pair? ls) (reverseX (cdr ls) (cons (car ls) acc)) (cons ls acc)))) (define-syntax let-synvalues* (syntax-rules () ((_ () B0 B ...) (begin B0 B ...)) ((_ (((Formal ...) Exp) Decl ...) B0 B ...) (call-with-values (lambda () Exp) (lambda (Formal ...) (with-syntax ((Formal Formal) ...) (let-synvalues* (Decl ...) B0 B ...))))))) (lambda (syn) (syntax-case syn () ((_ syn (kh . kt)) (let-synvalues* (((a b c) (f #'syn '() '() 0))) #'(kh 'a b c . kt))))))) (define-syntax mapper (lambda (x) (syntax-case x () ((_ F (RetId ...) (ThreadId ...)) (with-syntax (((t ...) (generate-temporaries #'(RetId ...))) ((ts ...) (generate-temporaries #'(RetId ...))) ((null ...) (map (lambda (x) #'()) #'(RetId ...)))) #'(let ((fun F)) (rec g (lambda (ThreadId ... ls) (if (null? ls) (values ThreadId ... null ...) (call-with-values (lambda () (g ThreadId ... (cdr ls))) (lambda (ThreadId ... ts ...) (call-with-values (lambda () (fun ThreadId ... (car ls))) (lambda (ThreadId ... t ...) (values ThreadId ... (cons t ts) ...)))))))))))))) ;;; ------------------------------ (define-syntax my-backquote (lambda (x) (define ellipsis? (lambda (x) (and (identifier? x) (literal-identifier=? x #'(... ...))))) (define-syntax with-values (syntax-rules () ((_ P C) (call-with-values (lambda () P) C)))) (define-syntax syntax-lambda (lambda (x) (syntax-case x () ((_ (Pat ...) Body0 Body ...) (with-syntax (((X ...) (generate-temporaries #'(Pat ...)))) #'(lambda (X ...) (with-syntax ((Pat X) ...) Body0 Body ...))))))) (define-syntax with-temp (syntax-rules () ((_ V Body0 Body ...) (with-syntax (((V) (generate-temporaries '(x)))) Body0 Body ...)))) (define-syntax with-temps (syntax-rules () ((_ (V ...) (Exp ...) Body0 Body ...) (with-syntax (((V ...) (generate-temporaries #'(Exp ...)))) Body0 Body ...)))) (define destruct (lambda (Orig x depth) (syntax-case x (quasiquote unquote unquote-splicing) ;; inner quasiquote ((quasiquote Exp) (with-values (destruct Orig #'Exp (+ 1 depth)) (syntax-lambda (Builder Vars Exps) (if (null? #'Vars) (values #''(quasiquote Exp) '() '()) (values #'(list 'quasiquote Builder) #'Vars #'Exps))))) ;; unquote ((unquote Exp) (zero? depth) (with-temp X (values #'X (list #'X) (list #'Exp)))) ((unquote Exp) (with-values (destruct Orig #'Exp (+ -1 depth)) (syntax-lambda (Builder Vars Exps) (if (null? #'Vars) (values #''(unquote Exp) '() '()) (values #'(list 'unquote Builder) #'Vars #'Exps))))) ;; splicing (((unquote-splicing Exp)) (zero? depth) (with-temp X (values #'X (list #'X) (list #'Exp)))) (((unquote-splicing Exp ...)) (zero? depth) (with-temps (X ...) (Exp ...) (values #'(append X ...) #'(X ...) #'(Exp ...)))) (((unquote-splicing Exp ...) . Rest) (zero? depth) (with-values (destruct Orig #'Rest depth) (syntax-lambda (Builder Vars Exps) (with-temps (X ...) (Exp ...) (if (null? #'Vars) (values #'(append X ... 'Rest) #'(X ...) #'(Exp ...)) (values #'(append X ... Builder) #'(X ... . Vars) #'(Exp ... . Exps))))))) ((unquote-splicing Exp ...) (with-values (destruct Orig #'(Exp ...) (+ -1 depth)) (syntax-lambda (Builder Vars Exps) (if (null? #'Vars) (values #''(unquote-splicing Exp ...) '() '()) (values #'(cons 'unquote-splicing Builder) #'Vars #'Exps))))) ;; dots (((unquote Exp) Dots) (and (zero? depth) (ellipsis? #'Dots)) (with-temp X (values #'X (list #'X) (list #'Exp)))) (((unquote Exp) Dots . Rest) (and (zero? depth) (ellipsis? #'Dots)) (with-values (destruct Orig #'Rest depth) (syntax-lambda (RestBuilder RestVars RestExps) (with-syntax ((TailExp (if (null? #'RestVars) #''Rest #'RestBuilder))) (with-temp X (values #'(append X TailExp) (cons #'X #'RestVars) (cons #'Exp #'RestExps))))))) ((Exp Dots . Rest) (and (zero? depth) (ellipsis? #'Dots)) (with-values (destruct Orig #'Exp depth) (syntax-lambda (ExpBuilder (ExpVar ...) (ExpExp ...)) (if (null? #'(ExpVar ...)) (syntax-error Orig "Bad ellipsis") (with-values (destruct Orig #'Rest depth) (syntax-lambda (RestBuilder RestVars RestExps) (with-syntax ((TailExp (if (null? #'RestVars) #''Rest #'RestBuilder)) (Orig Orig)) (values #'(let f ((ExpVar ExpVar) ...) (if (and (pair? ExpVar) ...) (cons (let ((ExpVar (car ExpVar)) ...) ExpBuilder) (f (cdr ExpVar) ...)) (if (and (null? ExpVar) ...) TailExp (error 'unquote "Mismatched lists in ~s" Orig)))) (append #'(ExpVar ...) #'RestVars) (append #'(ExpExp ...) #'RestExps))))))))) ;; Vectors (#(X ...) (with-values (destruct Orig #'(X ...) depth) (syntax-lambda (LsBuilder LsVars LsExps) (values #'(list->vector LsBuilder) #'LsVars #'LsExps)))) ;; random stuff ((Hd . Tl) (with-values (destruct Orig #'Hd depth) (syntax-lambda (HdBuilder HdVars HdExps) (with-values (destruct Orig #'Tl depth) (syntax-lambda (TlBuilder TlVars TlExps) (with-syntax ((Hd (if (null? #'HdVars) #''Hd #'HdBuilder)) (Tl (if (null? #'TlVars) #''Tl #'TlBuilder))) (values #'(cons Hd Tl) (append #'HdVars #'TlVars) (append #'HdExps #'TlExps)))))))) (OtherThing (values #''OtherThing '() '()))))) ;; macro begins (syntax-case x () ((_ Datum) (with-values (destruct #'(quasiquote Datum) #'Datum 0) (syntax-lambda (Builder (Var ...) (Exp ...)) (if (null? #'(Var ...)) #''Datum #'(let ((Var Exp) ...) Builder)))))))) (define-syntax extend-backquote (lambda (x) (syntax-case x () ((_ Template Exp ...) (with-syntax ((quasiquote (datum->syntax-object #'Template 'quasiquote))) #'(let-syntax ((quasiquote (lambda (x) (syntax-case x () ((_ Foo) #'(my-backquote Foo)))))) Exp ...)))))) ;;; ------------------------------ (define-syntax with-values (syntax-rules () ((_ P C) (call-with-values (lambda () P) C)))) (define-syntax letcc (syntax-rules () ((_ V B0 B ...) (call/1cc (lambda (V) B0 B ...))))) (define classify-list (lambda (ls) (cond ((null? ls) 'proper) ((not (pair? ls)) 'improper) (else (let f ((tortoise ls) (hare (cdr ls))) (cond ((eq? tortoise hare) 'infinite) ((null? hare) 'proper) ((not (pair? hare)) 'improper) (else (let ((hare (cdr hare))) (cond ((null? hare) 'proper) ((not (pair? hare)) 'improper) (else (f (cdr ls) (cdr hare)))))))))))) (define ilist-copy-flat (lambda (ils) (let f ((tortoise ils) (hare (cdr ils))) (if (eq? tortoise hare) (list (car tortoise)) (cons (car tortoise) (f (cdr tortoise) (cddr hare))))))) (define sexp-dispatch (lambda (obj pat);; #f or list of vars (letcc escape (let ((fail (lambda () (escape #f)))) (let f ((pat pat) (obj obj) (vals '())) (cond ((eq? pat 'any) (cons obj vals)) ((eq? pat 'each-any) ;; handle infinities (case (classify-list obj) ((proper infinite) (cons obj vals)) ((improper) (fail)))) ((pair? pat) (if (pair? obj) (f (car pat) (car obj) (f (cdr pat) (cdr obj) vals)) (fail))) ((vector? pat) (case (vector-ref pat 0) ((atom) (let ((a (vector-ref pat 1))) (if (eqv? obj a) vals (fail)))) ((vector) (if (vector? obj) (let ((vec-pat (vector-ref pat 1))) (f vec-pat (vector->list obj) vals)) (fail))) ((each) ;; if infinite, copy the list as flat, then do the matching, ;; then do some set-cdrs. (let ((each-pat (vector-ref pat 1)) (each-size (vector-ref pat 2))) (case (classify-list obj) ((improper) (fail)) ((infinite) (let ((each-vals (f pat (ilist-copy-flat obj) '()))) (for-each (lambda (x) (set-cdr! (last-pair x) x)) each-vals) (append each-vals vals))) ((proper) (append (let g ((obj obj)) (if (null? obj) (make-list each-size '()) (let ((hd-vals (f each-pat (car obj) '())) (tl-vals (g (cdr obj)))) (map cons hd-vals tl-vals)))) vals))))) ((tail-each) (let ((each-pat (vector-ref pat 1)) (each-size (vector-ref pat 2)) (revtail-pat (vector-ref pat 3)) (revtail-tail-pat (vector-ref pat 4))) (when (eq? (classify-list obj) 'infinite) (fail)) (with-values (let g ((obj obj)) ;; in-tail?, vals, revtail-left/ls (cond ((pair? obj) (with-values (g (cdr obj)) (lambda (in-tail? vals tail-left/ls) (if in-tail? (if (null? tail-left/ls) (values #f vals (list (car obj))) (values #t (f (car tail-left/ls) (car obj) vals) (cdr tail-left/ls))) (values #f vals (cons (car obj) tail-left/ls)))))) (else (values #t (f revtail-tail-pat obj '()) revtail-pat)))) (lambda (in-tail? vals tail-left/ls) (if in-tail? (if (null? tail-left/ls) (append (make-list each-size '()) vals) (fail)) (f each-pat tail-left/ls vals)))))))) (else (if (eqv? obj pat) vals (fail))))))))) ) ;; End Module #| ;;; examples of passing along threaded information. ;;; Try (collect-symbols '(if (x y 'a 'c zz) 'b 'c)) ;;; Note that it commonizes the reference to c. (define-syntax with-values (syntax-rules () ((_ P C) (call-with-values (lambda () P) C)))) (define collect-symbols (lambda (exp) (with-values (collect-symbols-help exp) (lambda (symbol-decls exp) (match symbol-decls (((,symbol-name . ,symbol-var) ...) `(let ((,symbol-var (quote ,symbol-name)) ...) ,exp))))))) (define collect-symbols-help (lambda (exp) (let ((symbol-env '())) (match+ (symbol-env) exp (,x (guard (symbol? x)) (values symbol-env x)) ((quote ,x) (guard (symbol? x)) (let ((pair/false (assq x symbol-env))) (if pair/false (values symbol-env (cdr pair/false)) (let ((v (gensym))) (values (cons (cons x v) symbol-env) v))))) ((quote ,x) (values symbol-env `(quote ,x))) ((if ,[t] ,[c] ,[a]) (values symbol-env `(if ,t ,c ,a))) ((,[op] ,[arg] ...) (values symbol-env `(,op ,arg ...))))))) ;;; the grammar for this one is just if-exprs and everything else (define collect-leaves (lambda (exp acc) (match+ (acc) exp ((if ,[] ,[] ,[]) acc) ((,[] ,[] ...) acc) (,x (cons x acc))))) ;; here's something that takes apart quoted stuff. (define destruct (lambda (datum) (match datum (() `'()) ((,[X] . ,[Y])`(cons ,X ,Y)) (#(,[X] ...) `(vector ,X ...)) (,thing (guard (symbol? thing)) `',thing) (,thing thing)))) ;; examples using explicit Catas (define sumsquares (lambda (ls) (define square (lambda (x) (* x x))) (match ls [(,[a*] ...) (apply + a*)] [,[square -> n] n]))) (define sumsquares (lambda (ls) (define square (lambda (x) (* x x))) (let ([acc 0]) (match+ (acc) ls [(,[] ...) acc] [,[(lambda (acc x) (+ acc (square x))) ->] acc])))) ;;; The following uses explicit Catas to parse programs in the ;;; simple language defined by the grammar below ;;; -> (program * ) ;;; -> (if ) ;;; | (set! ) ;;; -> ;;; | ;;; | (if ) ;;; | ( ) (define parse (lambda (x) (define Prog (lambda (x) (match x [(program ,[Stmt -> s*] ... ,[Expr -> e]) `(begin ,s* ... ,e)] [,other (error 'parse "invalid program ~s" other)]))) (define Stmt (lambda (x) (match x [(if ,[Expr -> e] ,[Stmt -> s1] ,[Stmt -> s2]) `(if ,e ,s1 ,s2)] [(set! ,v ,[Expr -> e]) (guard (symbol? v)) `(set! ,v ,e)] [,other (error 'parse "invalid statement ~s" other)]))) (define Expr (lambda (x) (match x [,v (guard (symbol? v)) v] [,n (guard (integer? n)) n] [(if ,[e1] ,[e2] ,[e3]) `(if ,e1 ,e2 ,e3)] [(,[rator] ,[rand*] ...) `(,rator ,rand* ...)] [,other (error 'parse "invalid expression ~s" other)]))) (Prog x))) ;;; (parse '(program (set! x 3) (+ x 4)))) => (begin (set! x 3) (+ x 4)) ;; CHANGELOG (most recent changes are logged at the top of this file) ;; [29 Feb 2000] ;; Fixed a case sensitivity bug. ;; [24 Feb 2000] ;; Matcher now handles vector patterns. Quasiquote also handles ;; vector patterns, but does NOT do the csv6.2 optimization of ;; `#(a 1 ,(+ 3 4) x y) ==> (vector 'a 1 (+ 3 4) 'x 'y). ;; Also fixed bug in (P ... . P) matching code. ;; [23 Feb 2000] ;; KSM fixed bug in unquote-splicing inside quasiquote. ;; [10 Feb 2000] ;; New forms match+ and trace-match+ thread arguments right-to-left. ;; The pattern (P ... . P) now works the way you might expect. ;; Infinite lists are now properly matched (and not matched). ;; Removed the @ pattern. ;; Internal: No longer converting into syntax-case. ;; [6 Feb 2000] ;; Added expansion-time error message for referring to cata variable ;; in a guard. ;; [4 Feb 2000] ;; Fixed backquote so it can handle nested backquote (oops). ;; Double-backquoted elipses are neutralized just as double-backquoted ;; unquotes are. So: ;; `(a ,'(1 2 3) ... b) =eval=> (a 1 2 3 b) ;; ``(a ,'(1 2 3) ... b) =eval=> `(a ,'(1 2 3) ... b) ;; ``(a ,(,(1 2 3) ...) b) =eval=> `(a ,(1 2 3) b) ;; Added support for ;; `((unquote-splicing x y z) b) =expand==> (append x y z (list 'b)) ;; [1 Feb 2000] ;; Fixed a bug involving forgetting to quote stuff in the revised backquote. ;; Recognized unquote-splicing and signalled errors in the appropriate places. ;; Added support for deep elipses in backquote. ;; Rewrote backquote so it does the rebuilding directly instead of ;; expanding into Chez's backquote. ;; [31 Jan 2000] ;; Kent Dybvig fixed template bug. ;; [31 Jan 2000] ;; Added the trace-match form, and made guards contain ;; an explicit and expression: ;; (guard E ...) ==> (guard (and E ...)) ;; [26 Jan 2000] ;; Inside the clauses of match expressions, the following ;; transformation is performed inside backquote expressions: ;; ,v ... ==> ,@v ;; (,v ,w) ... ==> ,@(map list v w) ;; etc. |#PhyBin-0.3/old_script/parser2.ss000066400000000000000000000023421221636131000166300ustar00rootroot00000000000000#! /bin/bash #| exec chez --script "$0" ${1+"$@"} |# (printf "Called with ~s\n" (command-line-arguments)) (include "match.ss") (import iu-match) ;[2001.07.15] (define port->slist (lambda (p) (let porttoslistloop ([exp (read p)] [acc '()]) (if (eof-object? exp) (begin (close-input-port p) (reverse! acc)) (porttoslistloop (read p) (cons exp acc)))))) ;; [2008.05.07] Adding a hack to tolerate #! lines at the top of the file. (define file->slist (lambda (filename . opts) (let* ([p (apply open-input-file filename opts)] [line1 (get-line p)]) ;; Doesn't allow whitespace! (if (and (>= (string-length line1) 2) (string=? "#!" (substring line1 0 2))) ;; Read the rest of it straight away: (port->slist p) ;; Oops, we need to "put back" that first line We do that by just starting over. (begin (close-input-port p) (port->slist (apply open-input-file filename opts)))) ))) (define sexps (file->slist (car (command-line-arguments)))) (define wstrings (match sexps [() '()] [(,uq ,[x]) (guard (eq? uq 'unquote)) x] [,s (guard (symbol? s)) (symbol->string s)] [(,[hd] . ,[tl]) `(,hd . ,tl)] )) (pretty-print wstrings) PhyBin-0.3/old_script/parser_fullnames.ss000066400000000000000000000113431221636131000206150ustar00rootroot00000000000000#! /bin/bash #| exec mzscheme -qu "$0" ${1+"$@"} |# ;; I'm duplicating the code from parser.ss and hacking it. (module parser_fullnames mzscheme ;; Import the parser and lexer generators. (require (lib "yacc.ss" "parser-tools") (lib "lex.ss" "parser-tools") (prefix : (lib "lex-sre.ss" "parser-tools")) (lib "list.ss") ;(lib "iu-match.ss") "iu-match.ss" ) ; (provide (all-defined)) (require (lib "pretty.ss")) (define-empty-tokens op-tokens (LeftParen RightParen Comma SemiColon Colon EOF )) (define-tokens value-tokens (NAME NUM)) (define-lex-abbrevs (lower-letter (:/ "a" "z")) (upper-letter (:/ #\A #\Z)) (digit (:/ "0" "9")) (digits (:* digit)) (letter (:or lower-letter upper-letter digit "." "'" "-" "/")) (name (:seq (:+ letter) (:* (:seq "_" (:* letter))))) (number (:seq digits "." digits (:? (:seq "E-" digits)))) ) (define ws-lex (lexer-src-pos [(eof) 'EOF] ;; Ignore all whitespace: [(:or #\tab #\space #\newline #\return) (return-without-pos (ws-lex input-port))] ["(" 'LeftParen] [")" 'RightParen] ["," 'Comma] [";" 'SemiColon] [":" 'Colon] [(:seq "'silva_" name "'") (begin ;(printf "TEST ~a\n" (substring lexeme 7 (sub1 (string-length lexeme)))) (token-NAME (substring lexeme 7 (sub1 (string-length lexeme)))) )] [(:seq "silva_" name) (token-NAME (substring lexeme 6 (string-length lexeme)))] ;[number (token-NUM (string->number lexeme))] [number (token-NUM lexeme)] )) (define (format-pos pos) (if (position-line pos) (format "line ~a:~a" (position-line pos) (position-col pos)) (format "char ~a" (position-offset pos)))) (define ws-parse (parser (src-pos) (start wholefile) (end EOF) (tokens value-tokens op-tokens) (error (lambda (a b c start end) (printf "PARSE ERROR: after ~a token, ~a~a.\n" (if a "valid" "invalid") b (if c (format " carrying value ~s" c) "")) (printf " Located between ~a and ~a.\n" (format-pos start) (format-pos end)))) ;; Precedence: ;(precs ) ;(debug "_parser.log") (grammar (wholefile [(tree SemiColon) $1] ;; ACK making semicolon and final tag optional!! [(tree) $1] [(LeftParen nodes+ RightParen) (cons "0.0" $2)] ) (numtag [(Colon NUM) $2]) (tree [(NAME numtag) (list $2 $1)] [(LeftParen nodes+ RightParen numtag) (cons $4 $2)]) (nodes+ [(tree) (list $1)] [(tree Comma nodes+) (cons $1 $3)]) ))) ;; ================================================================================ (define (ws-parse-file f) (let ([p (open-input-file f)]) (let ([res (ws-parse-port p)]) (close-input-port p) res))) (define (ws-parse-port ip) (port-count-lines! ip) ;; cdr (ws-parse (lambda () (flatten (ws-lex ip)))) ;(position-token-token (ws-lex ip)) ) (define (flatten pt) ;(printf " ") (let loop ((x pt)) (if (position-token? (position-token-token x)) (begin (error 'flatten "No nested position-tokens!") (loop (position-token-token x))) x))) (define allargs (vector->list (current-command-line-arguments))) (define pretty? #t) (define (main filename) #; (if pretty? (pretty-print (ws-parse-file filename)) (begin (write (ws-parse-file filename)) (newline) )) (define table '()) (define counter 1) (define (clean-name n) (define matches (regexp-match #rx".*__(.*)" n)) (define cleaned (car (reverse matches))) ;(printf "MATCHES ~a\n" matches) (set! table `((,counter ,cleaned) . ,table)) (set! counter (+ 1 counter)) ;(sub1 counter) ;; THIS IS ALL I CHANGED: cleaned ) (with-output-to-file (string-append filename ".nameonly") (lambda () (let loop ((x (ws-parse-file filename))) (match x ;; A leaf: [(,dist ,name) (guard (string? name)) (printf "~a:~a" (clean-name name) dist)] [(,dist ,children ...) (printf "(") (let inner ((ls children)) (cond [(null? ls) (void)] [(null? (cdr ls)) (loop (car ls))] [else (loop (car ls)) (printf ",") (inner (cdr ls))])) (printf "):~a" dist)] )) (printf ";\n"))) (with-output-to-file (string-append filename ".table") (lambda () (for-each (lambda (ls) (printf "~a ~a\n" (car ls) (cadr ls))) (reverse table)))) ) (print-graph #t) (print-vector-length #f) ;; Here's our script invocation: ;; When run in --persist mode we run in a loop. (if (null? allargs) (error 'wsparse "No filename provided...") (main (car allargs))) (printf "Done writing output files.\n") ;(exit 0) ) ;; End module. PhyBin-0.3/phybin.cabal000066400000000000000000000120021221636131000150100ustar00rootroot00000000000000Name: phybin Version: 0.3 License: BSD3 License-file: LICENSE Stability: Beta Author: Ryan Newton Maintainer: Ryan Newton -- Version history: -- 0.1.2 -- first significant release -- 0.1.2.1 -- -- 0.1.2.4 -- new release, several new features -- 0.1.2.5 -- bump for graphviz API changes -- 0.2 -- Add Robinson-Foulds distance, use Int labels. -- 0.2.2 -- misc changes and expose library -- 0.2.4 -- add consensus trees -- 0.2.6 -- colorization, hashrf, misc improvements -- 0.2.7 -- Add command line opt --showtrees -- 0.2.8 -- Add command line opt --highlight -- 0.2.9 -- Add command line opt --interior -- 0.2.10 -- Add command line opt --matching -- 0.2.11 -- Cleanup and windows compatibility. -- 0.3 -- significant improvements and new functionality. -- homepage: http://code.haskell.org/phybin -- homepage: http://people.csail.mit.edu/newton/phybin/ homepage: http://www.cs.indiana.edu/~rrnewton/projects/phybin/ Copyright: Copyright (c) 2010 Ryan Newton Synopsis: Utility for clustering phylogenetic trees in Newick format based on Robinson-Foulds distance. Description: This package provides a libary and executable for dealing with Newick tree files. . It can do simple binning of identical trees or more complex clustering based on an all-to-all Robinson-Foulds distance matrix. . phybin produces output files that characterize the size and contents of each bin or cluster (including generating GraphViz-based visual representations of the tree topologies). Category: Bioinformatics Cabal-Version: >= 1.8 Extra-source-files: README.md Build-type: Simple Source-repository head type: git location: git://github.com/rrnewton/PhyBin.git Flag hashrf description: Use the HashRF algorithm by default instead of the naive one. default: True Flag bitvec description: Use bitvectors rather than IntSets for bipartitions. default: False Flag sequential description: Don't use any parallelism at all. default: False Library Exposed-modules: Bio.Phylogeny.PhyBin Bio.Phylogeny.PhyBin.Binning Bio.Phylogeny.PhyBin.CoreTypes Bio.Phylogeny.PhyBin.Parser Bio.Phylogeny.PhyBin.PreProcessor Bio.Phylogeny.PhyBin.RFDistance Bio.Phylogeny.PhyBin.Util Bio.Phylogeny.PhyBin.Visualize -- Data.BitList Build-Depends: base >= 3 && < 5, directory, process, containers, async, time, filepath, graphviz >= 2999.16, text >= 0.11 && < 0.12, prettyclass, fgl, HUnit, bytestring, -- For bytestring.lazy support: parsec >= 3.1.0, vector >= 0.10, hierarchical-clustering >= 0.4, split >= 0.2 -- lattice-par, GHC-Options: -O2 -funbox-strict-fields -rtsopts if flag(hashrf) CPP-Options: -DUSE_HASHRF if flag(bitvec) CPP-Options: -DBITVEC_BIPS Build-Depends: bitvec >= 0.1 if flag(sequential) CPP-Options: -DSEQUENTIALIZE Executable phybin Main-is: Main.hs Build-Depends: phybin -- DUPLICATE: Build-Depends: base >= 3 && < 5, directory, process, containers, async, time, filepath, graphviz >= 2999.16, text >= 0.11 && < 0.12, prettyclass, fgl, HUnit, bytestring, -- For bytestring.lazy support: parsec >= 3.1.0, vector >= 0.10, hierarchical-clustering >= 0.4, split >= 0.2 -- lattice-par, GHC-Options: -O2 -funbox-strict-fields -rtsopts -threaded if flag(hashrf) CPP-Options: -DUSE_HASHRF if flag(bitvec) CPP-Options: -DBITVEC_BIPS Build-Depends: bitvec >= 0.1 if flag(sequential) CPP-Options: -DSEQUENTIALIZE -- DUPLICATED from executable: Test-Suite test-phybin Main-is: TestAll.hs Type: exitcode-stdio-1.0 -- Build-Depends: phybin Build-Depends: base >= 3 && < 5, directory, process, containers, async, time, filepath, graphviz >= 2999.16, text >= 0.11 && < 0.12, prettyclass, fgl, HUnit, bytestring, parsec >= 3.1.0, vector >= 0.10, hierarchical-clustering >= 0.4, split >= 0.2 -- Additional depends for test: Build-Depends: HUnit, test-framework, test-framework-hunit, test-framework-th GHC-Options: -O2 -funbox-strict-fields -rtsopts -threaded if flag(hashrf) CPP-Options: -DUSE_HASHRF if flag(bitvec) CPP-Options: -DBITVEC_BIPS Build-Depends: bitvec >= 0.1 if flag(sequential) CPP-Options: -DSEQUENTIALIZE PhyBin-0.3/point_check.sh000077500000000000000000000002221221636131000153610ustar00rootroot00000000000000#!/bin/bash arg1=$1 arg2=$2 phybin -p2 -b0.02 -n10 --complete --editdist=0 --rfdist Wolbachia/trees/*.$arg1.codon Wolbachia/trees/*.$arg2.codon PhyBin-0.3/stripTable.hs000066400000000000000000000024461221636131000152130ustar00rootroot00000000000000 -- | This is a quick hack to split up a table in a tab-delimited file of -- the format Irene produces. The result is a simple two-column, -- white-space delimited table of the kind phybin expects. import System.Environment (getArgs) import Control.Monad import Data.List import Data.List.Split import System.IO isDataLine :: String -> Bool isDataLine l = case words l of ("Roundup":"Orthology":_) -> False (_:"results":"found":_) -> False ("Gene":"Cluster":_) -> False ("Id":"Genome":_) -> False [] -> False _ -> True main = do args <- getArgs let file = case args of [f] -> f _ -> error "Expects one argument!! [filename]" raw <- readFile file let lns = lines raw filt = filter isDataLine lns toks = map (splitOn ['\t']) filt put = hPutStrLn stderr put$"Read "++show (length lns)++" lines from file "++show file put$" "++show (length filt)++" contain data" put$" Distinct #toks found on data lines: " ++ show(nub (map length toks)) put$" A sample of ten parsed lines :" mapM_ (put . (" "++) . show) $ take 10 $ toks put$" Echoing columns one and two in space-separated form:" forM_ toks $ \ (one:two:_) -> putStrLn$ one ++" "++ two return () PhyBin-0.3/tests/000077500000000000000000000000001221636131000137025ustar00rootroot00000000000000PhyBin-0.3/tests/t1/000077500000000000000000000000001221636131000142265ustar00rootroot00000000000000PhyBin-0.3/tests/t1/112.tr000066400000000000000000000016701221636131000151040ustar00rootroot00000000000000((6_134_D-alanine--D-alanine_ligase_wRi_WRi_001550:0.00139828386866881348,7_88_D-alanine--D-alanine_ligase_wMel_WD0095:0.00292963228134226648)90:0.01074298822528597409,((1_177_D-alanine--D-alanine_ligase_family_protein_wUni-gwu_gwu_177:0.00000260053623247659,2_1056_D-alanine--D-alanine_ligase_family_protein_wVitA-gwv_gwv_1056:0.00000260053623247659)100:0.04173654858419943714,((3_584_D-alanine--D-alanine_ligase_wBm_Wbm0570:0.12287024242687821785,14_97_D-alanine--D-alanine_ligase_wOo_wOo_01490:0.21253388760532263002)97:0.04234133455314714745,(13_1000_D-alanine--D-alanine_ligase_wAlbB_WALBB_840007:0.02885810606015209653,(19_986_D-alanine--D-alanine_ligase_wNo_wNo_10410:0.01561278511031995164,5_356_D-alanine--D-alanine_ligase_wPip-Pel_WPa_0403:0.01463937257251070670)86:0.02031938276584665734)100:0.05030810227222898973)94:0.02689897886038826805)94:0.01431391837063179621,18_124_D-alanine--D-alanine_ligase_wHa_wHa_01330:0.00955811509725618819); PhyBin-0.3/tests/t1/13.tr000066400000000000000000000016731221636131000150270ustar00rootroot00000000000000(1_903_uncharacterized_protein_involved_in_outer_membrane_biogenesis_wUni-gwu_gwu_903:0.00040202748005558947,(6_829_hypothetical_protein_wRi_WRi_009630:0.01489729960416468282,(7_929_hypothetical_protein_wMel_WD1011:0.00503560768415985908,(18_792_hypothetical_protein_wHa_wHa_08450:0.00231874015436256235,((14_125_hypothetical_protein_wOo_wOo_01830:0.25811131256411451451,3_124_hypothetical_protein_wBm_Wbm0121:0.13916197071632291360)100:0.06927753636164095397,(19_321_hypothetical_protein_wNo_wNo_03430:0.01639317844516003167,(13_886_conserved_exported_hypothetical_protein_wAlbB_WALBB_720012:0.00326028007674698025,5_586_hypothetical_protein_wPip-Pel_WPa_0647:0.00287609557850956876)100:0.00447884253275358801)100:0.14806894458377672930)100:0.07526112562734856015)53:0.00345017766882451359)57:0.00824257687487229586)100:0.01917411927034791061,2_608_uncharacterized_protein_involved_in_outer_membrane_biogenesis_wVitA-gwv_gwv_608:0.00000343564414107816); PhyBin-0.3/tests/t2/000077500000000000000000000000001221636131000142275ustar00rootroot00000000000000PhyBin-0.3/tests/t2/116.tr000066400000000000000000000017001221636131000151030ustar00rootroot00000000000000(3_1_uroporphyrinogen-III_decarboxylase_wBm_Wbm0001:0.24700160468123241730,((((6_846_uroporphyrinogen_decarboxylase_wRi_WRi_009840:0.00000450221771908982,18_807_Uroporphyrinogen_decarboxylase_wHa_wHa_08610:0.00294819211409539598)36:0.00000450221771908982,7_945_uroporphyrinogen_decarboxylase_wMel_WD1028:0.00196333316748527244)98:0.00593311441869561872,(2_624_uroporphyrinogen_decarboxylase_wVitA-gwv_gwv_624:0.00000450221771908982,1_914_uroporphyrinogen_decarboxylase_wUni-gwu_gwu_914:0.00098133353413034177)93:0.00000450221771908982)100:0.10725155104593755984,(19_475_Uroporphyrinogen_decarboxylase_4_wNo_wNo_04990:0.09396104790396474227,(13_676_Uroporphyrinogen_decarboxylase_wAlbB_WALBB_560003:0.02605769262950026130,5_782_uroporphyrinogen_decarboxylase_wPip-Pel_WPa_0843:0.01487808114070891460)100:0.07194998566389951278)100:0.09684640413579020946)83:0.05289788934073481191,14_264_uroporphyrinogen-III_decarboxylase_wOo_wOo_04390:0.32961182986000586714); PhyBin-0.3/tests/t2/233.tr000066400000000000000000000021451221636131000151070ustar00rootroot00000000000000(5_540_TrbL/VirB6_plasmid_conjugal_transfer_family_protein_wPip-Pel_WPa_0601:0.02084954811541155431,(19_278_Type_IV_secretion_system_protein_VirB6_putative_wNo_wNo_02980:0.05379840855764397162,((3_826_Type_IV_secretory_pathway_VirB6_components_wBm_Wbm0795:0.13408854198340605657,14_87_type_IV_secretory_pathway_VirB6_components_wOo_wOo_01320:0.21280678010849515824)79:0.04287407238705213258,((2_331_trbL/VirB6_plasmid_conjugal_transfer_family_protein_wVitA-gwv_gwv_331:0.00000263655140029384,1_766_type_IV_secretory_pathway_VirB6_components_domain_protein_wUni-gwu_gwu_766:0.00215959042042796704)100:0.02767854354886365234,(7_790_hypothetical_protein_wMel_WD0856:0.00573582661300406794,(6_707_Type_IV_secretion_system_protein_VirB6_putative_wRi_WRi_008210:0.00000263655140029384,18_687_Type_IV_secretion_system_protein_VirB6_putative_wHa_wHa_07380:0.00126238582938514388)100:0.00149674265440276474)100:0.04389458921346768572)100:0.06584511674229984179)100:0.13903817081131625288)100:0.03080481163636052816,13_925_putative_component_of_the_Type_IV_secretory_pathway_TrlB/VirB6-like_wAlbB_WALBB_750005:0.00661142799150828565); PhyBin-0.3/tests/t30_mismatched/000077500000000000000000000000001221636131000165065ustar00rootroot00000000000000PhyBin-0.3/tests/t30_mismatched/mismatched.tr000066400000000000000000000002401221636131000211670ustar00rootroot00000000000000(2_, (((14, 3_), (19, (5_, 13))), (18, (6_, 7_))), 1_); (2_, (((14, 3_), (19, (5_, 13))), (6_, 7_)), 1_); (2_, (((14, 3_), (19, (5_, 13))), (18, 6_, 7_)), 1_); PhyBin-0.3/tests/t3_consensus/000077500000000000000000000000001221636131000163305ustar00rootroot00000000000000PhyBin-0.3/tests/t3_consensus/cluster1_284_alltrees.tr000066400000000000000000000370401221636131000227350ustar00rootroot00000000000000(3_, (((13, 5_), 19), ((2_, 1_), ((7_, 6_), 18))), 14); (2_, (((7_, 6_), 18), (((5_, 13), 19), (3_, 14))), 1_); (6_, (18, (((19, (5_, 13)), (3_, 14)), (2_, 1_))), 7_); ((7_, 6_), (((19, (5_, 13)), (14, 3_)), (1_, 2_)), 18); ((13, 5_), ((14, 3_), (((6_, 7_), 18), (1_, 2_))), 19); (2_, (((14, 3_), (19, (5_, 13))), (18, (6_, 7_))), 1_); ((5_, 13), ((14, 3_), ((18, (7_, 6_)), (2_, 1_))), 19); (14, ((((7_, 6_), 18), (2_, 1_)), ((5_, 13), 19)), 3_); (13, (((3_, 14), ((1_, 2_), (18, (6_, 7_)))), 19), 5_); ((13, 5_), ((((7_, 6_), (1_, 2_)), 18), (14, 3_)), 19); (5_, (19, ((18, ((6_, 7_), (2_, 1_))), (14, 3_))), 13); (1_, (((((5_, 13), 19), (14, 3_)), 18), (6_, 7_)), 2_); (((18, ((6_, 7_), (2_, 1_))), (3_, 14)), (5_, 13), 19); (((7_, 6_), (1_, 2_)), ((19, (5_, 13)), (14, 3_)), 18); (1_, ((((14, 3_), ((13, 5_), 19)), 18), (6_, 7_)), 2_); (6_, ((((19, (13, 5_)), (3_, 14)), 18), (1_, 2_)), 7_); (14, (((5_, 13), 19), (((2_, 1_), 18), (7_, 6_))), 3_); (5_, (19, ((((2_, 1_), 18), (7_, 6_)), (14, 3_))), 13); (2_, (18, (((14, 3_), (19, (5_, 13))), (7_, 6_))), 1_); ((((14, 3_), (19, (5_, 13))), (6_, 7_)), (1_, 2_), 18); (2_, (((((5_, 19), 13), (3_, 14)), 18), (7_, 6_)), 1_); (14, ((((1_, 2_), (6_, 7_)), 18), (13, (5_, 19))), 3_); (((3_, 14), (13, (5_, 19))), ((7_, 6_), (2_, 1_)), 18); (7_, ((1_, 2_), (18, ((3_, 14), (13, (19, 5_))))), 6_); (14, ((18, ((6_, 7_), (2_, 1_))), (13, (19, 5_))), 3_); (6_, ((((13, (5_, 19)), (3_, 14)), 18), (2_, 1_)), 7_); (3_, (((19, 5_), 13), (18, ((6_, 7_), (1_, 2_)))), 14); (((1_, 2_), (6_, 7_)), ((3_, 14), (13, (5_, 19))), 18); ((6_, 7_), ((1_, 2_), ((3_, 14), (13, (19, 5_)))), 18); (3_, ((13, (19, 5_)), ((18, (6_, 7_)), (2_, 1_))), 14); (19, ((((2_, 1_), (18, (6_, 7_))), (14, 3_)), 13), 5_); (7_, (18, ((1_, 2_), ((3_, 14), (13, (19, 5_))))), 6_); (14, (((18, (6_, 7_)), (2_, 1_)), (13, (19, 5_))), 3_); (6_, (18, ((2_, 1_), (((19, 5_), 13), (14, 3_)))), 7_); ((7_, 6_), ((1_, 2_), ((14, 3_), (13, (19, 5_)))), 18); (7_, (((13, (19, 5_)), (3_, 14)), (18, (2_, 1_))), 6_); (3_, (((18, (2_, 1_)), (7_, 6_)), (13, (5_, 19))), 14); (14, ((5_, (13, 19)), ((6_, 7_), ((2_, 1_), 18))), 3_); (13, (5_, ((3_, 14), ((18, 7_), (6_, (2_, 1_))))), 19); (3_, (((6_, (2_, 1_)), (18, 7_)), (5_, (13, 19))), 14); ((19, 13), (((6_, (1_, 2_)), (7_, 18)), (3_, 14)), 5_); (((7_, 18), ((14, 3_), (5_, (19, 13)))), (1_, 2_), 6_); (1_, (((18, 7_), 6_), ((5_, (19, 13)), (14, 3_))), 2_); (3_, (((13, 19), 5_), ((6_, (18, 7_)), (1_, 2_))), 14); (3_, (((1_, 2_), ((7_, 18), 6_)), (5_, (19, 13))), 14); (3_, ((5_, (13, 19)), ((1_, 2_), ((18, 7_), 6_))), 14); ((18, 7_), (((14, 3_), (5_, (13, 19))), (2_, 1_)), 6_); (7_, (6_, ((2_, 1_), (((13, 19), 5_), (14, 3_)))), 18); (((14, 3_), ((2_, 1_), ((7_, 18), 6_))), (13, 19), 5_); (18, ((2_, 1_), ((((19, 13), 5_), (14, 3_)), 6_)), 7_); (19, (5_, ((6_, ((1_, 2_), (7_, 18))), (3_, 14))), 13); (3_, ((6_, ((2_, 1_), (18, 7_))), (5_, (13, 19))), 14); (((2_, 1_), (7_, 18)), ((5_, (19, 13)), (14, 3_)), 6_); ((19, 13), ((6_, ((2_, 1_), (7_, 18))), (3_, 14)), 5_); (2_, ((7_, 18), (((3_, 14), ((19, 13), 5_)), 6_)), 1_); (13, (5_, ((6_, ((1_, 2_), (18, 7_))), (3_, 14))), 19); (14, ((5_, (13, 19)), (((2_, 1_), (18, 7_)), 6_)), 3_); (13, (5_, ((6_, ((1_, 2_), (18, 7_))), (3_, 14))), 19); (13, (5_, ((((2_, 1_), (18, 6_)), 7_), (14, 3_))), 19); ((19, 13), ((14, 3_), (7_, ((1_, 2_), (18, 6_)))), 5_); (2_, ((6_, 18), (7_, ((5_, (19, 13)), (3_, 14)))), 1_); (((18, 6_), (1_, 2_)), ((5_, (13, 19)), (3_, 14)), 7_); (13, (5_, ((14, 3_), (((1_, 2_), (6_, 18)), 7_))), 19); (((3_, 14), (7_, ((2_, 1_), (6_, 18)))), (19, 13), 5_); ((19, 13), ((7_, ((6_, 18), (1_, 2_))), (14, 3_)), 5_); (6_, ((((5_, (19, 13)), (14, 3_)), 7_), (2_, 1_)), 18); (6_, ((7_, ((3_, 14), (5_, (19, 13)))), (1_, 2_)), 18); (1_, (((5_, (13, 19)), (3_, 14)), (7_, (18, 6_))), 2_); ((19, 13), (((7_, (6_, 18)), (2_, 1_)), (3_, 14)), 5_); ((13, 19), (((2_, 1_), ((6_, 18), 7_)), (14, 3_)), 5_); (1_, (((5_, (13, 19)), (3_, 14)), (7_, (6_, 18))), 2_); (6_, (7_, (((14, 3_), (5_, (13, 19))), (2_, 1_))), 18); (1_, ((7_, (6_, 18)), ((5_, (19, 13)), (14, 3_))), 2_); (14, ((5_, (19, 13)), ((2_, 1_), ((6_, 18), 7_))), 3_); (1_, (((3_, 14), (5_, (13, 19))), (7_, (18, 6_))), 2_); ((13, 19), (((2_, 1_), (7_, (18, 6_))), (14, 3_)), 5_); (2_, (((5_, (13, 19)), (14, 3_)), (7_, (18, 6_))), 1_); (1_, (((6_, 18), 7_), ((3_, 14), (5_, (13, 19)))), 2_); (14, (((7_, (6_, 18)), (2_, 1_)), (5_, (13, 19))), 3_); (18, ((7_, (1_, 2_)), ((5_, (13, 19)), (3_, 14))), 6_); (((18, 6_), ((5_, (13, 19)), (3_, 14))), (2_, 1_), 7_); (1_, (7_, ((18, 6_), ((3_, 14), ((13, 19), 5_)))), 2_); (19, (5_, ((14, 3_), (((1_, 2_), 7_), (18, 6_)))), 13); (18, (((5_, (13, 19)), (14, 3_)), (7_, (1_, 2_))), 6_); ((1_, 2_), (((5_, (19, 13)), (14, 3_)), (18, 6_)), 7_); (2_, ((7_, 6_), (((3_, 14), ((13, 19), 5_)), 18)), 1_); (14, ((5_, (19, 13)), (((7_, 6_), (1_, 2_)), 18)), 3_); (19, (((14, 3_), (((2_, 1_), (6_, 7_)), 18)), 5_), 13); (2_, ((((5_, (19, 13)), (3_, 14)), 18), (7_, 6_)), 1_); ((13, 19), ((18, ((6_, 7_), (1_, 2_))), (3_, 14)), 5_); (2_, (((((14, 3_), ((19, 13), 5_)), 18), 6_), 7_), 1_); (((7_, (1_, 2_)), 6_), ((14, 3_), (5_, (13, 19))), 18); (1_, (7_, (6_, (18, ((5_, (13, 19)), (3_, 14))))), 2_); (((14, 3_), (18, ((7_, (1_, 2_)), 6_))), (13, 19), 5_); ((13, 19), ((18, (6_, (7_, (1_, 2_)))), (3_, 14)), 5_); (3_, ((((6_, 18), 7_), (2_, 1_)), (19, (13, 5_))), 14); (2_, ((((5_, 13), 19), (14, 3_)), (7_, (6_, 18))), 1_); (2_, (((3_, 14), ((5_, 13), 19)), ((6_, 18), 7_)), 1_); (1_, (((19, (13, 5_)), (3_, 14)), ((18, 6_), 7_)), 2_); (((2_, 1_), (((13, 5_), 19), (3_, 14))), (6_, 18), 7_); (5_, (19, ((3_, 14), ((2_, 1_), (7_, (6_, 18))))), 13); ((6_, 18), (((19, (13, 5_)), (14, 3_)), (1_, 2_)), 7_); (((2_, 1_), ((19, (13, 5_)), (14, 3_))), (6_, 18), 7_); (2_, (((19, (5_, 13)), (3_, 14)), ((6_, 18), 7_)), 1_); (5_, (((3_, 14), (((6_, 18), 7_), (1_, 2_))), 19), 13); (13, (19, (((2_, 1_), ((18, 6_), 7_)), (3_, 14))), 5_); (14, ((19, (5_, 13)), ((7_, (6_, 18)), (2_, 1_))), 3_); (5_, (19, ((3_, 14), ((1_, 2_), (7_, (18, 6_))))), 13); (5_, (19, ((14, 3_), ((1_, 2_), (7_, (18, 6_))))), 13); (18, (7_, ((1_, 2_), ((14, 3_), (19, (5_, 13))))), 6_); (5_, (19, (((1_, 2_), ((6_, 18), 7_)), (3_, 14))), 13); (14, ((19, (5_, 13)), (((2_, 1_), 7_), (18, 6_))), 3_); (14, ((19, (13, 5_)), ((7_, (2_, 1_)), (18, 6_))), 3_); (14, (((18, 6_), ((1_, 2_), 7_)), (19, (13, 5_))), 3_); (2_, (7_, ((((13, 5_), 19), (14, 3_)), (18, 6_))), 1_); (3_, (((5_, 13), 19), ((6_, 18), (7_, (2_, 1_)))), 14); (1_, (((18, 6_), ((19, (13, 5_)), (3_, 14))), 7_), 2_); (14, ((19, (13, 5_)), ((6_, 18), (7_, (2_, 1_)))), 3_); (18, ((7_, (2_, 1_)), ((3_, 14), (19, (13, 5_)))), 6_); (((3_, 14), ((18, 6_), (7_, (2_, 1_)))), (5_, 13), 19); ((5_, 13), ((14, 3_), (7_, ((2_, 1_), (6_, 18)))), 19); (3_, ((19, (13, 5_)), (7_, ((2_, 1_), (18, 6_)))), 14); (2_, ((7_, ((3_, 14), (19, (13, 5_)))), (6_, 18)), 1_); (6_, ((((3_, 14), ((5_, 13), 19)), 7_), (2_, 1_)), 18); (18, ((1_, 2_), (7_, ((14, 3_), (19, (5_, 13))))), 6_); (3_, ((19, (13, 5_)), (7_, ((2_, 1_), (6_, 18)))), 14); (2_, ((18, 6_), (7_, (((13, 5_), 19), (14, 3_)))), 1_); (1_, ((((14, 3_), (19, (5_, 13))), 7_), (6_, 18)), 2_); (6_, ((7_, ((19, (13, 5_)), (14, 3_))), (1_, 2_)), 18); ((5_, 13), ((3_, 14), (7_, ((2_, 1_), (18, 6_)))), 19); ((((((5_, 13), 19), (3_, 14)), 7_), 6_), (2_, 1_), 18); (13, (19, ((((18, (1_, 2_)), 6_), 7_), (3_, 14))), 5_); ((6_, (18, (1_, 2_))), ((3_, 14), (19, (13, 5_))), 7_); (1_, ((6_, (7_, ((14, 3_), (19, (5_, 13))))), 18), 2_); (1_, (18, (7_, (6_, ((3_, 14), (19, (5_, 13)))))), 2_); (((3_, 14), (19, (5_, 13))), (7_, (18, (1_, 2_))), 6_); (5_, (19, ((3_, 14), (6_, (7_, (18, (1_, 2_)))))), 13); (5_, (19, ((14, 3_), (6_, (7_, ((2_, 1_), 18))))), 13); ((5_, 13), ((6_, (((2_, 1_), 18), 7_)), (14, 3_)), 19); (5_, (19, (((((2_, 1_), 18), 7_), 6_), (14, 3_))), 13); ((5_, 13), (((7_, (18, (2_, 1_))), 6_), (3_, 14)), 19); ((5_, 13), ((3_, 14), (6_, ((1_, 2_), (7_, 18)))), 19); (1_, ((18, 7_), (((19, (5_, 13)), (14, 3_)), 6_)), 2_); (7_, ((6_, ((14, 3_), (19, (13, 5_)))), (1_, 2_)), 18); (5_, (19, ((3_, 14), (6_, ((1_, 2_), (18, 7_))))), 13); (3_, ((((18, 7_), (2_, 1_)), 6_), (19, (13, 5_))), 14); (5_, (19, ((6_, ((2_, 1_), (7_, 18))), (3_, 14))), 13); (3_, ((((18, 7_), (2_, 1_)), 6_), (19, (5_, 13))), 14); (((2_, 1_), (7_, 18)), ((14, 3_), (19, (5_, 13))), 6_); ((13, 5_), ((14, 3_), (6_, ((1_, 2_), (7_, 18)))), 19); (3_, ((19, (5_, 13)), (((1_, 2_), 6_), (18, 7_))), 14); (13, (19, (((7_, 18), (6_, (2_, 1_))), (14, 3_))), 5_); (7_, ((6_, (2_, 1_)), (((13, 5_), 19), (3_, 14))), 18); (3_, ((19, (5_, 13)), ((6_, (2_, 1_)), (18, 7_))), 14); (7_, ((6_, (1_, 2_)), ((3_, 14), (19, (5_, 13)))), 18); (1_, ((((19, (5_, 13)), (3_, 14)), (18, 7_)), 6_), 2_); (((14, 3_), (18, (7_, ((1_, 2_), 6_)))), (19, 5_), 13); (5_, (13, ((14, 3_), (18, ((6_, (2_, 1_)), 7_)))), 19); ((19, 5_), ((3_, 14), (18, (7_, ((1_, 2_), 6_)))), 13); ((7_, ((((19, 5_), 13), (14, 3_)), 18)), (1_, 2_), 6_); (19, (13, ((18, ((6_, (1_, 2_)), 7_)), (3_, 14))), 5_); ((1_, 2_), (7_, (18, ((14, 3_), (13, (5_, 19))))), 6_); (14, ((18, (7_, ((1_, 2_), 6_))), (5_, (19, 13))), 3_); (2_, (6_, ((((14, 3_), (5_, (19, 13))), 18), 7_)), 1_); ((2_, 1_), (7_, (18, ((5_, (19, 13)), (14, 3_)))), 6_); (((2_, 1_), 6_), (18, ((3_, 14), (5_, (13, 19)))), 7_); ((7_, ((1_, 2_), 6_)), ((3_, 14), (5_, (19, 13))), 18); (13, (((14, 3_), ((7_, (6_, (2_, 1_))), 18)), 19), 5_); (1_, (6_, (7_, (18, ((14, 3_), (19, (13, 5_)))))), 2_); (14, ((19, (5_, 13)), ((((1_, 2_), 6_), 7_), 18)), 3_); (2_, ((7_, (((19, (5_, 13)), (14, 3_)), 18)), 6_), 1_); ((18, ((3_, 14), ((5_, 13), 19))), (6_, (1_, 2_)), 7_); ((19, 13), ((7_, ((6_, (2_, 1_)), 18)), (3_, 14)), 5_); (((6_, (1_, 2_)), 18), ((14, 3_), (5_, (13, 19))), 7_); (14, (((((1_, 2_), 6_), 18), 7_), (5_, (13, 19))), 3_); (((((3_, 14), ((13, 5_), 19)), 7_), 18), (2_, 1_), 6_); ((13, 5_), ((7_, (18, (6_, (1_, 2_)))), (14, 3_)), 19); (5_, (19, ((7_, (18, ((2_, 1_), 6_))), (3_, 14))), 13); (13, (19, ((3_, 14), ((((2_, 1_), 6_), 18), 7_))), 5_); (2_, (6_, ((((19, (5_, 13)), (14, 3_)), 7_), 18)), 1_); (3_, (((5_, 13), 19), (7_, (18, ((1_, 2_), 6_)))), 14); (((6_, (1_, 2_)), 18), (((19, 5_), 13), (3_, 14)), 7_); ((((14, 3_), ((19, 5_), 13)), 7_), (6_, (1_, 2_)), 18); ((7_, ((13, (19, 5_)), (3_, 14))), (6_, (2_, 1_)), 18); (5_, (13, ((14, 3_), ((18, (6_, (1_, 2_))), 7_))), 19); (1_, (6_, (((((19, 5_), 13), (14, 3_)), 7_), 18)), 2_); (2_, (7_, (18, (6_, ((14, 3_), ((13, 5_), 19))))), 1_); (14, (((((2_, 1_), 7_), 18), 6_), (19, (13, 5_))), 3_); (14, ((19, (5_, 13)), (6_, ((7_, (2_, 1_)), 18))), 3_); (14, ((19, (5_, 13)), (6_, (18, (7_, (1_, 2_))))), 3_); (1_, (7_, (((((13, 5_), 19), (3_, 14)), 6_), 18)), 2_); ((18, (7_, (1_, 2_))), (((5_, 13), 19), (14, 3_)), 6_); ((7_, (1_, 2_)), (6_, ((19, (5_, 13)), (3_, 14))), 18); (5_, (19, ((6_, (18, (7_, (1_, 2_)))), (14, 3_))), 13); ((18, (((3_, 14), (5_, (19, 13))), 6_)), (1_, 2_), 7_); (1_, (7_, (18, (((5_, (19, 13)), (3_, 14)), 6_))), 2_); (1_, (7_, ((6_, ((5_, (13, 19)), (14, 3_))), 18)), 2_); (3_, ((6_, (18, ((2_, 1_), 7_))), (5_, (19, 13))), 14); (3_, ((5_, (19, 13)), (6_, (18, ((1_, 2_), 7_)))), 14); ((13, 19), ((6_, (18, (7_, (2_, 1_)))), (14, 3_)), 5_); (14, (((19, 13), 5_), ((18, ((2_, 1_), 7_)), 6_)), 3_); ((6_, (((13, 19), 5_), (3_, 14))), (7_, (1_, 2_)), 18); (14, ((6_, (18, ((2_, 1_), 7_))), (13, (5_, 19))), 3_); (2_, (7_, (18, (((13, (5_, 19)), (3_, 14)), 6_))), 1_); ((1_, 2_), (18, (((3_, 14), (13, (19, 5_))), 6_)), 7_); (((13, (19, 5_)), (14, 3_)), (18, (7_, (2_, 1_))), 6_); (3_, ((((7_, (2_, 1_)), 18), 6_), (13, (5_, 19))), 14); (14, ((6_, (((1_, 2_), 7_), 18)), (13, (19, 5_))), 3_); (((1_, 2_), 7_), (6_, ((14, 3_), ((19, 5_), 13))), 18); ((((13, (5_, 19)), (14, 3_)), 6_), (7_, (2_, 1_)), 18); ((5_, 19), ((6_, (18, ((7_, 2_), 1_))), (3_, 14)), 13); (2_, (1_, (18, (((13, (19, 5_)), (14, 3_)), 6_))), 7_); ((18, (1_, (7_, 2_))), ((14, 3_), (13, (19, 5_))), 6_); (7_, ((18, (((14, 3_), (13, (5_, 19))), 6_)), 1_), 2_); (1_, (2_, (18, (6_, ((13, (19, 5_)), (14, 3_))))), 7_); (18, ((1_, 2_), (6_, (((19, 5_), 13), (3_, 14)))), 7_); ((7_, ((2_, 1_), 18)), (((5_, 19), 13), (3_, 14)), 6_); (19, (13, ((6_, (7_, (18, (2_, 1_)))), (3_, 14))), 5_); ((19, 5_), ((6_, (7_, (18, (1_, 2_)))), (3_, 14)), 13); (5_, (13, ((14, 3_), (6_, (7_, (18, (1_, 2_)))))), 19); (19, (13, (((7_, (18, (2_, 1_))), 6_), (3_, 14))), 5_); ((7_, ((1_, 2_), 18)), ((13, (19, 5_)), (3_, 14)), 6_); (((14, 3_), ((18, 6_), (7_, (2_, 1_)))), (19, 5_), 13); (6_, ((7_, (2_, 1_)), ((13, (19, 5_)), (3_, 14))), 18); (((3_, 14), (((1_, 2_), 7_), (18, 6_))), (5_, 19), 13); ((5_, 19), (((18, 6_), (7_, (2_, 1_))), (14, 3_)), 13); (5_, (((14, 3_), ((6_, 18), (7_, (2_, 1_)))), 13), 19); (3_, (((7_, (2_, 1_)), (6_, 18)), (13, (19, 5_))), 14); ((5_, 19), (((6_, 18), (7_, (1_, 2_))), (3_, 14)), 13); (5_, (13, ((7_, (2_, 1_)), ((6_, 18), (14, 3_)))), 19); (18, (((13, (5_, 19)), (7_, (2_, 1_))), (3_, 14)), 6_); (1_, (7_, ((14, 3_), (6_, (18, (13, (19, 5_)))))), 2_); (19, (13, (18, (((14, 3_), 6_), (7_, (2_, 1_))))), 5_); ((6_, ((14, (3_, (13, (5_, 19)))), 18)), (2_, 1_), 7_); (2_, (7_, (6_, ((14, (3_, (13, (19, 5_)))), 18))), 1_); ((6_, (18, ((14, 3_), (13, (19, 5_))))), (1_, 2_), 7_); (5_, (13, (((((1_, 2_), 7_), 6_), 18), (3_, 14))), 19); ((6_, (7_, (2_, 1_))), ((3_, 14), (13, (19, 5_))), 18); ((7_, (1_, 2_)), (18, ((14, 3_), ((5_, 19), 13))), 6_); (14, ((13, (5_, 19)), (18, (6_, (7_, (1_, 2_))))), 3_); ((7_, (1_, 2_)), (18, ((13, (19, 5_)), (3_, 14))), 6_); (2_, (7_, ((18, ((13, (5_, 19)), (3_, 14))), 6_)), 1_); (2_, (((18, (((5_, 19), 13), (14, 3_))), 6_), 7_), 1_); (1_, (7_, ((18, (3_, (14, (13, (19, 5_))))), 6_)), 2_); ((19, 5_), (3_, (14, (18, ((2_, 1_), (6_, 7_))))), 13); (18, (6_, (((3_, 14), ((5_, 19), 13)), (1_, 2_))), 7_); ((((13, (5_, 19)), (3_, 14)), (2_, 1_)), (7_, 18), 6_); ((((14, 3_), ((19, 5_), 13)), (1_, 2_)), (18, 7_), 6_); (7_, (6_, ((1_, 2_), ((3_, 14), ((5_, 19), 13)))), 18); (2_, ((6_, (18, 7_)), ((14, 3_), (13, (19, 5_)))), 1_); (18, (6_, (((3_, 14), ((19, 5_), 13)), (1_, 2_))), 7_); (14, (((19, 5_), 13), ((2_, 1_), ((7_, 18), 6_))), 3_); (2_, ((7_, (6_, 18)), ((14, 3_), (13, (19, 5_)))), 1_); (19, ((((2_, 1_), (7_, (6_, 18))), (3_, 14)), 13), 5_); ((18, 6_), (((13, (19, 5_)), (14, 3_)), (2_, 1_)), 7_); (2_, (((14, 3_), ((19, 5_), 13)), (7_, (18, 6_))), 1_); ((5_, 19), ((3_, 14), (((6_, 18), 7_), (2_, 1_))), 13); (6_, (7_, (((3_, 14), (13, (5_, 19))), (2_, 1_))), 18); ((((5_, 19), 13), 14), ((1_, 2_), (18, (7_, 6_))), 3_); (((((13, (19, 5_)), 14), 3_), (2_, 1_)), (7_, 6_), 18); ((6_, 7_), ((3_, (14, (13, (19, 5_)))), (1_, 2_)), 18); (((3_, ((13, (5_, 19)), 14)), (1_, 2_)), (6_, 18), 7_); (2_, (((14, (13, (19, 5_))), 3_), ((6_, 18), 7_)), 1_); ((18, 7_), ((3_, (14, (13, (5_, 19)))), (1_, 2_)), 6_); ((5_, 19), (3_, (((1_, 2_), ((7_, 18), 6_)), 14)), 13); ((6_, 18), ((2_, 1_), (14, (3_, (13, (19, 5_))))), 7_); ((13, (5_, 19)), (14, (7_, (18, ((2_, 1_), 6_)))), 3_); ((((3_, 14), 7_), (18, ((1_, 2_), 6_))), (5_, 19), 13); (14, (18, (((19, 5_), 13), (7_, (6_, (2_, 1_))))), 3_); (6_, (2_, ((7_, ((13, (19, 5_)), (14, 3_))), 18)), 1_); (19, (13, ((18, (7_, ((6_, 1_), 2_))), (14, 3_))), 5_); (1_, (((18, ((13, (5_, 19)), (3_, 14))), 2_), 6_), 7_); (7_, ((((5_, 19), 13), (3_, 14)), (18, (6_, 2_))), 1_); (19, (13, ((((6_, (18, 1_)), 7_), 2_), (14, 3_))), 5_); (((((14, 3_), (13, (19, 5_))), 2_), 6_), (18, 1_), 7_); (19, (((14, 3_), ((1_, ((18, 2_), 6_)), 7_)), 13), 5_); (3_, ((1_, (7_, ((18, 2_), 6_))), (13, (19, 5_))), 14); (14, ((7_, ((1_, 2_), (6_, 18))), (13, (19, 5_))), 3_); (19, (((((6_, 18), (2_, 1_)), 7_), (3_, 14)), 13), 5_); (((14, 3_), (13, (19, 5_))), ((2_, 1_), (6_, 18)), 7_); (14, ((13, (5_, 19)), (7_, ((6_, 18), (1_, 2_)))), 3_); (5_, (((7_, ((6_, 18), (1_, 2_))), (3_, 14)), 13), 19); (5_, (13, ((7_, ((2_, 1_), (18, 6_))), (14, 3_))), 19); (3_, (((2_, 1_), (6_, 18)), ((13, (19, 5_)), 7_)), 14); PhyBin-0.3/tests/t3_consensus/cluster1_284_consensus.tr000066400000000000000000000000541221636131000231350ustar00rootroot00000000000000((19, 5_, 13), 14, 3_, 18, 6_, 7_, 1_, 2_); PhyBin-0.3/tests/t4_consensus/000077500000000000000000000000001221636131000163315ustar00rootroot00000000000000PhyBin-0.3/tests/t4_consensus/cluster1_16_alltrees.tr000066400000000000000000000016001221636131000226400ustar00rootroot00000000000000(3_, ((((6_, 18), 7_), (2_, 1_)), (19, (13, 5_))), 14); (2_, ((((5_, 13), 19), (14, 3_)), (7_, (6_, 18))), 1_); (2_, (((3_, 14), ((5_, 13), 19)), ((6_, 18), 7_)), 1_); (1_, (((19, (13, 5_)), (3_, 14)), ((18, 6_), 7_)), 2_); (((2_, 1_), (((13, 5_), 19), (3_, 14))), (6_, 18), 7_); (5_, (19, ((3_, 14), ((2_, 1_), (7_, (6_, 18))))), 13); ((6_, 18), (((19, (13, 5_)), (14, 3_)), (1_, 2_)), 7_); (((2_, 1_), ((19, (13, 5_)), (14, 3_))), (6_, 18), 7_); (2_, (((19, (5_, 13)), (3_, 14)), ((6_, 18), 7_)), 1_); (5_, (((3_, 14), (((6_, 18), 7_), (1_, 2_))), 19), 13); (13, (19, (((2_, 1_), ((18, 6_), 7_)), (3_, 14))), 5_); (14, ((19, (5_, 13)), ((7_, (6_, 18)), (2_, 1_))), 3_); (5_, (19, ((3_, 14), ((1_, 2_), (7_, (18, 6_))))), 13); (5_, (19, ((14, 3_), ((1_, 2_), (7_, (18, 6_))))), 13); (18, (7_, ((1_, 2_), ((14, 3_), (19, (5_, 13))))), 6_); (5_, (19, (((1_, 2_), ((6_, 18), 7_)), (3_, 14))), 13); PhyBin-0.3/tests/t4_consensus/cluster1_16_consensus.tr000066400000000000000000000000701221636131000230450ustar00rootroot00000000000000((((5_, 13), 19), (14, 3_)), ((18, 6_), 7_), (1_, 2_)); PhyBin-0.3/tests/t5_consensus/000077500000000000000000000000001221636131000163325ustar00rootroot00000000000000PhyBin-0.3/tests/t5_consensus/cluster1_35_alltrees.tr000066400000000000000000000036501221636131000226510ustar00rootroot00000000000000(3_, ((((6_, 18), 7_), (2_, 1_)), (19, (13, 5_))), 14); (2_, ((((5_, 13), 19), (14, 3_)), (7_, (6_, 18))), 1_); (2_, (((3_, 14), ((5_, 13), 19)), ((6_, 18), 7_)), 1_); (1_, (((19, (13, 5_)), (3_, 14)), ((18, 6_), 7_)), 2_); (((2_, 1_), (((13, 5_), 19), (3_, 14))), (6_, 18), 7_); (5_, (19, ((3_, 14), ((2_, 1_), (7_, (6_, 18))))), 13); ((6_, 18), (((19, (13, 5_)), (14, 3_)), (1_, 2_)), 7_); (((2_, 1_), ((19, (13, 5_)), (14, 3_))), (6_, 18), 7_); (2_, (((19, (5_, 13)), (3_, 14)), ((6_, 18), 7_)), 1_); (5_, (((3_, 14), (((6_, 18), 7_), (1_, 2_))), 19), 13); (13, (19, (((2_, 1_), ((18, 6_), 7_)), (3_, 14))), 5_); (14, ((19, (5_, 13)), ((7_, (6_, 18)), (2_, 1_))), 3_); (5_, (19, ((3_, 14), ((1_, 2_), (7_, (18, 6_))))), 13); (5_, (19, ((14, 3_), ((1_, 2_), (7_, (18, 6_))))), 13); (18, (7_, ((1_, 2_), ((14, 3_), (19, (5_, 13))))), 6_); (5_, (19, (((1_, 2_), ((6_, 18), 7_)), (3_, 14))), 13); (14, ((19, (5_, 13)), (((2_, 1_), 7_), (18, 6_))), 3_); (14, ((19, (13, 5_)), ((7_, (2_, 1_)), (18, 6_))), 3_); (14, (((18, 6_), ((1_, 2_), 7_)), (19, (13, 5_))), 3_); (2_, (7_, ((((13, 5_), 19), (14, 3_)), (18, 6_))), 1_); (3_, (((5_, 13), 19), ((6_, 18), (7_, (2_, 1_)))), 14); (1_, (((18, 6_), ((19, (13, 5_)), (3_, 14))), 7_), 2_); (14, ((19, (13, 5_)), ((6_, 18), (7_, (2_, 1_)))), 3_); (18, ((7_, (2_, 1_)), ((3_, 14), (19, (13, 5_)))), 6_); (((3_, 14), ((18, 6_), (7_, (2_, 1_)))), (5_, 13), 19); ((5_, 13), ((14, 3_), (7_, ((2_, 1_), (6_, 18)))), 19); (3_, ((19, (13, 5_)), (7_, ((2_, 1_), (18, 6_)))), 14); (2_, ((7_, ((3_, 14), (19, (13, 5_)))), (6_, 18)), 1_); (6_, ((((3_, 14), ((5_, 13), 19)), 7_), (2_, 1_)), 18); (18, ((1_, 2_), (7_, ((14, 3_), (19, (5_, 13))))), 6_); (3_, ((19, (13, 5_)), (7_, ((2_, 1_), (6_, 18)))), 14); (2_, ((18, 6_), (7_, (((13, 5_), 19), (14, 3_)))), 1_); (1_, ((((14, 3_), (19, (5_, 13))), 7_), (6_, 18)), 2_); (6_, ((7_, ((19, (13, 5_)), (14, 3_))), (1_, 2_)), 18); ((5_, 13), ((3_, 14), (7_, ((2_, 1_), (18, 6_)))), 19); PhyBin-0.3/tests/t5_consensus/cluster1_35_consensus.tr000066400000000000000000000000661221636131000230540ustar00rootroot00000000000000((((5_, 13), 19), (14, 3_)), (1_, 2_), (18, 6_), 7_); PhyBin-0.3/vary_branchlen.sh000077500000000000000000000023421221636131000160750ustar00rootroot00000000000000#!/bin/bash set -e if [ "$1" == "" ]; then echo "ERR: Two args required!"; exit 1; fi if [ "$2" == "" ]; then echo "ERR: Two args required!"; exit 1; fi taxa=$1 dataset=$2 # 12 settings: branchlens="0.0001 0.001 0.01 0.02 0.03 0.05 0.1 0.2 0.3 0.5 1.0 5.0" DOGRAPH="-g" echo Wiping "$dataset/phybin_output/" ... rm -rf "$dataset/phybin_output/" echo "[parallel] Starting jobs..." for bl in $branchlens; do echo "Running with branch len $bl" echo "==============================" PHYBIN="./phybin.exe -b $bl $DOGRAPH -n $taxa" outdir="$dataset/phybin_output/brlen$bl/" LOG="$dataset/phybin_output/brlen$bl.log" mkdir -p $outdir # ASSUMPTION1! We need the renaming table at a particular spot: # ASSUMPTION2! We likewise expect the inputs in a certain spot. RENAMER="$dataset/renaming_table.txt" echo " RUNNING: $PHYBIN -m $RENAMER -s '_' -o $outdir" ($PHYBIN -m $RENAMER -s '_' -o $outdir $dataset/final_trees/*BranchLab*.out | tee $LOG) & done echo "[parallel] Waiting for outstanding jobs" FAIL=0 for job in `jobs -p` do echo $job wait $job || let "FAIL+=1" done if [ "$FAIL" == "0" ]; then echo "All jobs completed successfully" else echo "ERROR some jobs ($FAIL) failed!" exit 1 fi PhyBin-0.3/website/000077500000000000000000000000001221636131000142025ustar00rootroot00000000000000PhyBin-0.3/website/trees.jpg000066400000000000000000001167311221636131000160370ustar00rootroot00000000000000JFIFHHICC_PROFILEapplmntrRGB XYZ  &acspAPPL-appldescPbdscmBcprtwtptrXYZgXYZbXYZrTRC aarg $ vcgt DndinX>chad,mmod(bTRC gTRC aabg $ aagg $ descDisplaymluc nlNLdaDKplPLenUS,nbNO>frFRPptBRfptPT~zhCN esESjaJPruRU$svSEzhTWdeDEfiFIitIT"koKR 6Kleuren-LCDLCD-farveskrmKolor LCDColor LCDFarge-LCDLCD couleurLCD ColoridoLCD a Cores_ir LCDLCD color000 LCD&25B=>9 -48A?;59Frg-LCD_irmfvoy:VhFarb-LCDVri-LCDLCD colori LCDtextCopyright Apple, Inc., 2010XYZ RXYZ d3XYZ j_XYZ ' curv #(-26;@EJOTY^chmrw| %+28>ELRY`gnu| &/8AKT]gqz !-8COZfr~ -;HUcq~ +:IXgw'7HYj{+=Oat 2FZn  % : O d y  ' = T j " 9 Q i  * C \ u & @ Z t .Id %A^z &Ca~1Om&Ed#Cc'Ij4Vx&IlAe@e Ek*Qw;c*R{Gp@j>i  A l !!H!u!!!"'"U"""# #8#f###$$M$|$$% %8%h%%%&'&W&&&''I'z''( (?(q(())8)k))**5*h**++6+i++,,9,n,,- -A-v--..L.../$/Z///050l0011J1112*2c223 3F3334+4e4455M555676r667$7`7788P8899B999:6:t::;-;k;;<' >`>>?!?a??@#@d@@A)AjAAB0BrBBC:C}CDDGDDEEUEEF"FgFFG5G{GHHKHHIIcIIJ7J}JK KSKKL*LrLMMJMMN%NnNOOIOOP'PqPQQPQQR1R|RSS_SSTBTTU(UuUVV\VVWDWWX/X}XYYiYZZVZZ[E[[\5\\]']x]^^l^__a_``W``aOaabIbbcCccd@dde=eef=ffg=ggh?hhiCiijHjjkOkklWlmm`mnnknooxop+ppq:qqrKrss]sttptu(uuv>vvwVwxxnxy*yyzFz{{c{|!||}A}~~b~#G k͂0WGrׇ;iΉ3dʋ0cʍ1fΏ6n֑?zM _ɖ4 uL$h՛BdҞ@iءG&vVǥ8nRĩ7u\ЭD-u`ֲK³8%yhYѹJº;.! zpg_XQKFAǿ=ȼ:ɹ8ʷ6˶5̵5͵6ζ7ϸ9к<Ѿ?DINU\dlvۀ܊ݖޢ)߯6DScs 2F[p(@Xr4Pm8Ww)Kmparaff Y vcgt,RQN&A> q 9  );Rl3eX!*"a#$%'(>)`*z+,-./0123{4c5:66789r:K;&;<=>?j@FA#BBCDEFGnHUI;J!KKLMNO|PcQQR=S+TUUVWXYZ[o\W]A^'_ _`abcddeIf.gghijklrmRn2oopqrsmtJu(vvwxy]z/{{|}x~Kāj< ݅~N\,؎1ZΖ ՟Š~tiYD,گʰӸAnƖǷ.?MW_dhkmpݑe_4^T9 1Pw!i~m hz 4  $@` !"#$%&'(u)S*+*+,-_."./0O1 123<345r6/678f9%9:;Z<<=>Q??@A^B&BCDEPFFGH}IHJJKLuM@N NOPqQ=R RSTUSV%VWXYlZ>[[\]^U_%_`abhc8ddef{gMhhijk^l,lmnogp5qqrslt9uuvwfx1xyz{[|%|}~N(΃t`N;(֎yoe\ULB8/& 2I^r̰ر8lѹ3dǾ,e)ŐXɺ{;НӒpI)"H߮r!2K{2lBP&,Q O  pM+~udJ, v!E""#$J%%&d''(V()*/*+k, ,-G-./(/0i1 12S234E4567678(89r::;b<<=L=>???@ACABCTDDEpF$FGH9HIJLJKL\M MNoO OPQGRRS~T:TUVoW*WXY]ZZ[\C\]^r_._`ajb*bcdme-efgkh*hijfk$klm`nnop\qqrs]ttuv`w wxyaz z{|^}!}~vC炾jR;# ܋Č}lgdbabbcfhkor|աBnͨ2oX 2۾ªCndin6$UR&-P T9O\7Y}9bEmAn)Z&[ L\D M  w Q M T e=q C&w$>p*h - !"#`$4% %&'(){*i+Y,L-A.9/4001/21344>5P6e7}89:<=+>Y?@ACEDEG@HIKVLN OPRkSUbVXhY[]^`>ac1dXefghj+k`lmo4pqsqtvxy{<|~]R􉦋e'鐬n-ꗤ[ŞnA馘P֭rE޶e󹉻(ӾO ͜j>jM-rg20TL8W^.t%Lq 6b 5aO W>{N7A z h n . i [m%j0Z){Y; !!"#$%&'()*+,-./1203N4r5679$:Y;<> ?O@AC.DEG7HIKcLN@OQ+RT0UWEXZj\]_C`bdeghj km#np+qs>tvdwy{,|~]&\Bꎕ@퓚GL:̢b?窔Di%ն\q#QĆ#_̔_3ӡֆtjfdd1J_fdX=hWxwg3V:V*_Cr:p3g D8xC.:s g y _  J~T- !"#.$F%_&{'()+*,d-.001}245c679M:<=a>@1ACDFGIJL%MO9PR]SUW=XZ\H^_a|c@efhjnlDnoqsuwy{}u^A x<ΓmAО]٥Y߬j2ȵEa]ĢBɃ%UifJسy/߇4;^|sCq>B< s75!f5z(_?s&S3Vz+Nqsf32 B&lmmod1ExifMM*JR(iZHHҠC      C  " }!1AQa"q2#BR$3br %&'()*456789:CDEFGHIJSTUVWXYZcdefghijstuvwxyz w!1AQaq"2B #3Rbr $4%&'()*56789:CDEFGHIJSTUVWXYZcdefghijstuvwxyz ?袊((((((((((((((/' !#xK1ծ/B{F2]^aW89  ?(scu?B O'?h*??'S*?  ?(scu?B O'?h*??'S*?  ?(scu?B O'?h*??'S*?  ?(scu?B O'?h*??'S*?  ?(scu?B O'?h*??'S*?  ?(scu?B O'?h*??'S*?  ?(scu?B O'?h*??'S*?  ?(scu?B O'?h*??'S* IxOǯ/ıC:؝[6`Ox[8 QEQEQEQEQEQEQEQEQEQEQEQEQEQEQEQEQEQEQEQEQEQEQEQEQEQEQEQEQE_Z_{l'WrֿQEQEQEQEQX3|;_Oҭ{0Od%I=eFH6tUxOn/mu IFRky؊@Q@Q@Q@Q@Q@Q@Q@Q@Q@~ 믊?_+c(ZE?4QEQEQEQEQEQEQEQEQEQEQEQEQEQEQEQEQEQEQEQEQEQEQEQEQEQEQEQEQE`l'WrֿG (1\>5QEQEQE,M:׬8>kfԴ5}>{MN$6Xe]X`=&><{FgW`[%q8HPX$#_߉W6?s2OtKc˶Nq5!o-g?9g%(Mm2HTI -[O_>-Fx;QʲE0H'HAk25|f(;(`DG-Pw+*+*MCǎM ZU{eV BQEQEQE|E<)m7oAUk/L *JO5? |!?RFs^K3jEJHXl ygG!TWF7Hw5;]JN0&6U>ƾ?~'h}j5flA"˫FwhΎ CFцǯi>)xs~u ;=&QǨ[ $H`@>((((>cA_uG+ڿ~H(((((((((((((((((((((((((((((  Zu޾a??떵F(((((CG xĺZyr'TnGj VtXdG0Z6g2HOF p3xW[ mc ip5xTpW?>C%@;+~^0cNף$]JXTgW[rc0˖< o0x'[Phom>#N`/>nT",=:÷_-ƛo&k"1dw$1i[v>kF%m= ݵY8(~i_܋{2*[Gg+O#ݡE.rt 4i5 ۛmBhӬuG%vج?`0@$P_Yt=||V˦]^QR>x3o{+Qi/]HMfXu -'h(I!jy\}_u:w`q.If~bScoɖWiw_EQn0Ob~fAǎMzOْ6~#siq{&Y5@  $dᇵQ@Q@Q@Q@Q@~ 믊?_+c(ZE?4QEQEQEQEQEQEQEQEQEQEQEW]~?kٟ|_nF&xǫ_VlԚW}ji?Ϗ 8!kmDd1{?g5C?^ '>+d3F=f#eC_s~ϟNo>xGD ~bP_|/??@ɞ\i7mӠ : =k8ORwe((((((((((((((((a??떵F?OAQ((((((Oa7xñK//lbR[TTG9 SN9A$r#WR2#"#px]GIpOlqXmu:_H,־-E"0@N0 ev]!T<!<)<nlڷ&OX H\$@8>x'Kj%[[KkK˟^I3C&Y2*F W-淆uAH$K*$̄sڬ.K($b+8ϑlF8F[^ ( ( ( ( ( ( ?1_-"}'W]|Q~h(((((((/7k_Hg.dlYe&}.€<Ҋ/ŵ-k? eǛ Ub۟%x^|/.~6^-~y\fiGuNﲱSM[$qDF=+L~&_,״6khx. %d{W+C[2X xD}7L?g Jy<^@gW_W[2ika2 R??rϿ@{~Kyc{5(:C/m!UC7M4a[+t!言(AW袀 (?/8ORwe+8ORwe((((((((((((((((a??떵F?OAQ((((((u-6Xա6hP*0V$b>}5>7[-w^ƶyt!-F|($Q2d#$3{xyu}@Wyq */ʣܒr|U% ;*(((((((>cA_uG+ڿ~H(((( h_wjW >ç1~_s~Ͽj_/$_o+igaXOT~bIY$=(򢴼'_Ǻ:_tKZN"{>~mZHno>n$efO0s߭}oh_%mFweWq9&?P~Fc1}6] 7C߳iyW_?=OSl,jj(4^DWKRkhkq-E&?T};mmpƑCDE `ڟEW7?D? H{y/ooe=l,K6DP*矉?ZQo^2X7#r?[ԅz˿ߴ9(|.eLzۇ]hܣ$lUцHCL((((((((((  Zu޾a??떵F(((((((#_|`WB޸}g_ λz(((((((+'W]|Q>cA_uG((K{]K>= ]g![š&ߐ1tVZMΕhڌ?-odQa=VmQEQEQEQEQE`l'WrֿG (1\>5QEQEQEQEQEQEQEq,jv~>(1㞼jn#硠M#8Uu,3E: ( ( ( ( ( ( ?1_-"}'W]|Q~h+?77R6WÕ)i}^Zn(Y(((<du>? \_pj7\rS\>}ltM34HW`A|>sAS!4)%-[L=0A +7]uu⿳Go$%LL« T`cSt\ɮ[7!V' .L*27d +]tKėZZwv tgJ~2h~׬U㬵%-!GHbJf Ux}m;öIM…ޱx xx-ln-C$N(tC#QAU h ( ( ( (?/8ORwe+8ORwe((4Gn ~?p.V?g((((((S/< f6-Fr|\ fo=mgړ_Amg&23B E5ڍwP$R?J?[,4q&fvu f?U { 㙦[=V1+*6bLy2ܼ{*|_/xdwFUR9kjů |_ Cs\gLd޿y0%^Ef$XGj(2_6g◇5 _AtlbUSW=TA;źT*g)H4(f|L2Hw4=Ѧfi,Z&crs޾]o^"_>\KyghhZ~._Uw[#nZ}Ɛb7 ;Aſ gU.&cq+(3Q @y&|[o+8k:EŃdS\`l'WrֿG (1\>5QEQEQEQEQEQEQEx/SxgtN02$h:qrj?ئOCz"񖧤Mw>"a D#]FKv)wT0 q!P@o;Uv|7$+?xV嶳խZH"KE}W/є ]y>EQEQEQEQEQEQEWO1 ُi_?|f?PEP_q򖟇KOq@Q@Q@Q@Q@q>9Va6Zu7 (;[+bj|Nq {p?tTQEQEQEQEQEQE2_pa_Wpa_PEP_h'˟D?\ܭ~QEQEQEQEQEQEoYxói眱JUHd1ʺ0A A$qh.7JR@(=:}Eyڏ$![W2d04J %}Bm|3gw*tPzoERynw(Ǿ.ONS11bGbI$u5,* 8WACE. Udجgb 6m⽷Ě5 x1*WRAek$4"kXc{ss>|тV?5QEQEQEQEQEQEQEq.jһz<]%?vQEQEQEQEQEQEP|mpBSq>*FᏊ75xk^ۋm."K_݀v hkV𞢶PԴȗK]lY*eBB3F!="[k̚^ڍ-{"32Ģ4‡dkSwh{O]MJ?&4ڙue c+%gҏG;?wl` MZխEi׭M= 46qCݣ.s!;"s^xM-g[$kɌ@'*O {2LJk{"i vCQ Y^"'3>a+ω?|/ x"ԘUM O,_?|f?W~ 믊?@Q@}Z~זۊrs-?WmK4QEQEQEQE0[F4 ʹn8\})?WI\׍N<_}JaW?@-Q@Q@Q@V/<{aCvea$awKq!$3@TWj|QNͧisrAKo1љzX-7LϺ{OA𝦯  taue>/ [ ¾/ [ € ( D?\ܭ~0#7 rhz(((((((,^!&ʬph"XAPo5+:ʽH$+ :璮PVVEkA# @uz0? c|D·gELTYkS'8^TaE`v!-Hc0UFP :7z^s-:x_AW8(PU O ( ( .(?nKC\%'Mza??떵F?OAQhw5?i&ѷ-ܡ{k]o0_6F*7E"57f/m**KMIRHF*0_nQ@:I[Mux<ͫHWvG4|ؒھ:6HTOx¬?g  ocsn @(E.>#xg_ъJm}D/[4 4Qpϣr@QE|Qdž,4 ω5 ( Jw1sYx/÷V1e|p;9f'(׊|'W W-54]u}A76Hgt?m1iIs&^f&fbO6ȇ퍡='BMCɄ$w p%, `$d!._tߋmS sQ`; $o_i dC=q:Z( V:ãjvM3HյM,|!w4{Xwt<Q""$`1#q~ ƥ!@IK {%'\_E'aRsu[]7WRd*LMmEA ʋ^w|Ci jEP.ZLAj7H@H[;X˲ԁ֏<i㑦jy]W}MnWrG|`H9oNUWx4]^kƼL(~ >-ӣބZ˝˩۲6  J#U[CвKwhxceQ,3ı%brORjsp'B~#^IdW)6j:%J9(td*=$N:3(__u/[M-ɏNʗɛM?gu > ?4Z{=6+;{j-5rm)3r1 2l1;`IHH2qlQE2_pa_Wpa_PEP_h'˟D?\ܭ~QE2;H[Hwv RI(WqHxd]4zT)*gwǺ*}?ȊO,Bxc$GhX:yw((((( }y7B?ݚf,?lW)q(Qh#l9ʞF9ؐ`rRĀTrO@iuO^9 /jJ!6c@`6GUυxFAwQ:bDWCEp𪵟!WӥX+JJ0ýv␱81-z¿ j1_sA+˓ypXDn:>'8T~P?'𷇵)4 x~Q<$C*=O5 i"%vg˶`\jRrNXI4Jcjdv}vIFpV-/R쏵3\faMyP>cᇍIMl[}>G}27RH#xKSNt1#%n'#ʳy*ÏNҴ}LbH-m"Xa4Q+@o_Xk~0\%-R#hmlƺ((((((d?K¾0?K¾0(gO?+_5=ܹZ2"be}*EUTn#S n5QEQEQEQEQXmK~4jI'&w9q߅lSfE *I+~|TM t euuqs#]Or%⺊#e7}˴O=s~;񤺅Λ SKgk)Z514ĈJ<@l0 e^Jpz>s]a|.;ះtLVmfG29Ĩ73Xrkv ( ( ( ( ( ( ( ( ?1_-"}'W]|Q~h+?77R6WÕ)i}^Zn(Y(((Lmȇ S0>6eI|Y 꺞l쵔 Pu&AgƟnt _M__Dk5y.2U`z(-m8mP=%PEPEPEPEPEPEP% _.L} _.LQE=ܹZa4GnQ@Q@qO{ԓÖ́3e "]OnG>9յ;k xsQㄼt<ז5dСRʒg@ˀO5V;Ρ`,ߋ"_QݽzJ( 3T֬"!wG,Nz85G^ƕ ^'oo3\I( pM+]f [N5!xu5c6wfd 7ۡWJ8nkh Qoa|Q1Ȳn>$ Xt`|dlZ㢦‘X\!v*׃| }y_]%kfX$"fR *9+{ KdKmD.@cA_uG((KOq_Wnoyjmf((x$nVt \ł -<|A{#|:f2^VHlT!˓bH㷍 M:6j:ޕkg,5 'il`g$ZSK?j69ý&f̟BY.-&(c˝~^1óMkQY>զ4׃Xm!m"@u*qAj((((((d?K¾0?K¾0(gO?+_5=ܹZ((ou4DHP8Ad#ƞ> ie$M: wk鷍my[-,fP2q$|Oξ6]?O{|UEHERWnAg@H"O|7Pk^/m$Kvg%4qd`Jyʮ9+4^f{kWaxvKxQBEf$FI%0hLoe1k}OzEQEQEQEQE5#K$&ĝgiVȱwr"' ߩ6Oψv5Qq ܨ@xٝTFe$!#r7袊%k~뺗,?u >{,6hyk>-xSQϠ][>_jA[ַ;&SihH|0|Y< v7EWT4{EcAoxcIްETyr;j_?46fўO Aj\iW6cm*M*gRݑm78y;2†y,&o)Q~&U Dڽc'Im& +U dveM2ƻTA@gw_eZ7]7۠엮<z!<1=Kҭux;b89wI+9O>+%c) Λv%@ ̣G( uu--7Ő|vuU9\O'$((((((((+'W]|Q>cA_uG((KOq_Wnoyjmf(Pwo-3/ly;8O7c: cSH֛nT5նI_,[*Tv U_ 4Oi刍ͳV8,c&x3ƓX_yxg`j [ 8$P,ě^{f0D$ Cd2qʨ< [TQ@VG|(|Cq j:C4+ivy 1.T VXuP[RcbRO +~Z)jNi7/ThB%T:X9ܭx/PK)5Wҟ왗S q@4QEQEQEQEQEQE2_pa_Wpa_PEP_h'˟D?\ܭ~QEQEQ\_>7i l"UբPpxL8pǏ]7m+Uq㗷3%߸ ׾(VR`үIkXev߿|+w|a~<=xrI0i =ۏ4QUI|8}BE7ojr'-izzwJFs;ğzآ((Z—6:oqtJ?PMx>-NҦm*=FQpmQQgOLqVhd*7X~FvȘ =ihf JH%e5`< ?| ʻ_23Y' !,]r@|e^'k]C RGl#J4<9BۘBX源r29QEQEQEQEQEQEQEWO1 ُi_?|f?PEP_q򖟇KOq@Q@Q@Q@Q@s-uΩo;xZ, VBݼ9#jUH/%մ-e#H$ H`6AkoF7euYog1\oA#bV2v=@. &hЂB6=2H7 5κVʖQim4-/qۢ %@$l"~ Wp]xQMUH-sfl1s1 EyIvG"jz£mlU\{#޵|!AY\Ki.vZBI&N,:( ( ( (?/8ORwe+8ORwe((4Gn ~?p.V?g(ĖW?KAUnQD#QIb1lm'8WےFYT2# Ъ@@>me}> h+ I~iyхB3-=f((((BFX( _TDҮo-"y*2B,p;+/_n-{|$eT̑U;[smlp}C<_ZW(C2ua]HwW'cm!ojrƒ' G3  @>Td^Oc%F.O4 C2`"bM FI$ ..`\mPbIQEQE|]Q藮Ÿ&Dd_Z_{l'WrֿQER[jCuo2)P:8 Ep~? 'rQt2} A(xsgIJc a~W袀 ( ( ( RkmF$6XnY Hh je[ KYdlޞ4X cB5)a՗|0>=8EG5{6^u.}HӋkChfv`Eu4WоѮnag{m̱A,'kHSFCj>w4>!u褞4rFAp_/undJwtQEQEQEQEQEQE+c(ZE~WO1 ُiEWnoyjm+?77R6PEPEPEPE3I׾k6~b/.m$ _ W=SP=.G!0Pd)R|<-e5lfDk&l@e˾z(ݯk&CwEmkY0&@r(!v(9B/בRNRSnsFGi0UexA>t363!6PN9֭7מ;m/·{[j72(1e8]~S 3ڞՋݽޢqDc0lmO'{gi:(w*][ʧf ;Egڥӡz8rJG5V5OlK7Σ5oE|=eÍ7V݈K9nh|IpUp[ެk_xzGݝ2g[ۮfb'mmDۿ >0tY| z'6S}E1m Om+A/5h'g}YP f{pOa.:j ( ( ( ( ?1_-"}'W]|Q~h+O :ĝF-+JTeV(}®vFI=9?cJl@>=,uy_V2>p>W[_|]࿏7v| H5)BE~_ |f𓠺% -GOjmChT|A>gm>[<{KFoWᦹ始tr7$^ è;OJ *m7V"l&fWh;/ZH5;QTӟe\+g#  υn+]hΦx~1g8eTR~]2{h((((((K]7l+ ?]7l+ (+{ sr_h'˟( ( +Ə:f_xú+KFvג }_$kէzug!A +_l+`|>^cQXg+|k $-Yt3?M*FJ{Rxme"rxO(oO]Sǚ6pph ~$^U57>1u-Z[ۗ4/OGG?mxr1+|kKO?𵥀2}M~V?}o-̂H >34\fV>*$hڎn1i[)QtMk~>! 9ER}?eHBKYس1$Tڄֲ"Եw:I Wp\w Z޳f-"`HWlrGx=Rrvˏ 9ҵ1=|Vuk&@ԫV\y֋\ƹ5ϽSkz}\ 1KjH_R- tqqq;\P>+xkq+H3KIBw? yĿSZk:MpVJ?ݢzx޳zxހ/|"zxހ/ oz'@^u^΍sqgq),4nF";7ſ{cyq_`=ޜ&?xc_Qx%oGkҮ6/Kj qG"&>~qxʼ)1;Vp WXj=GM#y?~~! dǩEs`z&P!*rCE~/7~ 9aKn#׭x/ u g ,(g`TE~qx+ c|? lXh+ּ{'1$u(:+ż5㢫xKSQI,vB* U4_ڙizWq P@QEQE2_pa_Wpa_PEP_k%㴿5tEƀmC,?jČN#X~ޚfg]sMb~ zvi>^G?OD_RO ڜ=G =Y3SLX͚cO@?V3SzgU4hL3⪴Ɵހ-ƸqLi\Sު5ǽ1={kIRUHhRWJl99ͮ=9' nAiOa+\]6*99=btkD8?5#\{ު5ǽF4=6-dazW+u Yu1q]QeR6OLuljfC`;Al9ٮS%H 3$n}M2ɀ bvgu^<`0+qᏉzw?%1a??떵F?OAQ(((((OCB(욄aYmDIduVQ[tPxKѤ*~:hbyK$2''DE~)ZͷhwVB {T_]}\kLQ{y"n̺X\»[ 0x/=)[/|XeOVeԣ\\HP誇j$ _aj^:Uռ =R;C6l&ʋ"|B. Z<+uf n%R)Cm}@=(((((((~Hj ?1_-"?( %~?לW~-Iި,/,4}g jOYޙ}g,Ku>ޫq%ϝSϥa^/a#0 #oOsy廑Ln v Xz[f=k†I-UBRplǹ~v\mTێtMN{5ޜ& jpoZ% j% boZROIhRHܫ)r*%U_S!ϏN݊_sn_ҽkW~ !xxot)۴`+-8K@ #c>^+w]j]i?[s/_Zpހ>p|D~+S(5M'Qfe`0F;WU?Kxׯ^]@Q@}9}ۗWua/ 5W3S eML3f,?Vi=Y3zzZzzzz⪴Ƹ MqQǽUkzc\{5ǽUkz=뫧][H9$ OgMwM@prvNF8NSScAʞ?#Sހ-ϽD>U}֨go2@ ;$YpsX-: dڑ>c3< ϼY'YD( $>{r9%ms2wx {!\親rY.[NJIzgī}6E5Il'WrֿG (1\>5QEQEQEQEQEQEQEQEQEQEQEQEQEQEQE+c(ZE~WO1 ُiEW-ƄI>am3]JgAgz5[cEX HING^s5s.:&a"N3H-׃O ROYϞހ/ 5QYj/x;!,g>&{ P0oTr]#5A] 㻰Q nz4 ozmB(e5 Mbo,Q@oSOWjii3Eo%s,w=_ZZͮ.-x$!c_lۃ158MTmܒ(e#= H& jp i-[ӄZ& Ƞ b_8Kjps;i/Xk+lSo%ʀ ( G kg?-?@JƟޫƚ,ƞ4֟ހ#CI%a~{sXW6EX 6FÙX+`]M.eɕǨێYvz}+ Kq0v%d |Q=Ԓ,I#>f }+z'zqܝVހ,ƞ4O@ZzcOU|Tm=YiqUZz=\{mqU^ޣkz{gހ0r|GO_?-֠ɺfrOϵfsI=ingBWw8p8,j $3rOA@ {eM>_<{=^Ɵa$cLbpN 1q^-=ϽRѯJB0`H8 >isPs@!,Y19"n3OE^{zCj~x4Iͤ\DԬWG>W_=vh>"נ]#į4vC{X^Et٢e[8޽D)Y4&6MZdYLH?2 dnhho™nX Jɭ`h û| T!C+o|C?!ӿ|C?!ӿtoMG⯎txLBڎ/!lvqᕯk:QHl-eY$ Fg+63|C?!ӿ|C?!ӿYo|C?!ӿ|C?!ӿYo|C?!ӿ|C?!ӿYo|C?!ӿ|C?!ӿYo|C?!ӿ|C?!ӿYo|C?!ӿ9࿿g|'|B5?͊04ݟ*?cd 4QE~-fZgF?DI@Z"Y?HM1u/'GqUk= {k,GpPo>ҡVPsj X ʫ$3 rc"l6;(Zzsv~X5qfjͲ1pTDbrG*$ncrT0SOQY4azM?Ykz={o?Y{z=Q=ǽKKpHEp3~ UL2qJTkM8?Ҁ-C# tvqUޡ{z g^ހ-IsPsU^ޡހ,sX:\gR\?.7xKUg7hoMgK'a}?`6, ooW}}ǛGUz!Wphٝ>qR!` 6z!~ n|/}=mFO6=6cG918I~.xc }vA" WL#L,˵G}ytImiw Q~47)/<)f]\h 5<iHء[ oٖ^x{sqgY~Ec +xPQh^6jXI3#c袀>+Gu|_x{XZeme,%̦jI:}ڤn6G$$x xlx{}8j/u "ڸaal+q^-7ޏ7ޫycooX}}꿛G@A(ymts]Dc [Ov2PI?JrOR,ԋ?g"@ˉAJ_+sK1L.AvYE$ytn#P˯G_ZȑC! #jT5q5!?6TIk8 W=}hV /|݊Ǒպ`~5}f  * @LP嚞&+5=f ozzTjz@jxi-\SZĴpKO1-8K@-Tӄhw~^ ?Wע~w|׿E^w@Q@{>?('֯xf_@{m7Fv$i?FzH=VjpwvԄ1MG-r3%9%1[QGB=cQ]I(z"RX~uGw}8]FYCm .~ Q'ߖ1aPI[/޸ OĤ1QM `@ }n#`j&֡' U|G3г#^TU5;q'%^}f>c£m]D&ҝ%D.uWX)˜T@1ל*_4FS&'N{MqQ=ǽG!2(I.=.}9&\2\rI%Ͻdxw!Hk'g)C }eɏS£{|߭x,O2,sq47;q47;q47;q47;q47;q47_w[xFwM4vQP-ZQf&Րd_Gs? )Ih/F@'8̰޶{W\Rػ m֟8f'ZT*Co .[L|e =<߭yÏVi~ca %w ߭o֡}M ߭o֡}M ߭o֡}M ߭o֡}M ߭o֡}M ߭o֡}M ߭o֡}M ߭|Pmu{?G?m@$EW])Uҏv/\uD}ԫ?RN*(OR$MO*hR,MRZS|e\> ;bdvBW?2MJLJ͎?zVjfޤC[Y7UN?5ZYEg%ZY5V)r9 "ZzUN(Nzzhe<ڤxk?jN~;WEPPhyBin-0.3/website/website.css000066400000000000000000000023551221636131000163630ustar00rootroot00000000000000body { margin: auto; padding-right: 1em; padding-left: 1em; max-width: 44em; border-left: 1px solid black; border-right: 1px solid black; color: black; font-family: Verdana, sans-serif; font-size: 100%; line-height: 140%; color: #333; } pre { border: 1px dotted gray; background-color: #ececec; color: #1111111; padding: 0.5em; } code { font-family: monospace; } h1 a, h2 a, h3 a, h4 a, h5 a { text-decoration: none; color: #7a5ada; } h1, h2, h3, h4, h5 { font-family: verdana; font-weight: bold; border-bottom: 1px dotted black; color: #7a5ada; } h1 { font-size: 130%; } h2 { font-size: 110%; } h3 { font-size: 95%; } h4 { font-size: 90%; font-style: italic; } h5 { font-size: 90%; font-style: italic; } h1.title { font-size: 200%; font-weight: bold; padding-top: 0.2em; padding-bottom: 0.2em; text-align: left; border: none; } dt code { font-weight: bold; } dd p { margin-top: 0; } #footer { padding-top: 1em; font-size: 70%; color: gray; text-align: center; }