{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE ExplicitNamespaces #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE GADTs #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RankNTypes #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TypeApplications #-}

-- | Row- and column-subsetting operations on a 'DataFrame': slicing by
-- position, filtering by predicate or expression, selecting/excluding
-- columns and random sampling helpers.
module DataFrame.Operations.Subset where

import qualified Data.List as L
import qualified Data.Map as M
import qualified Data.Set as S
import qualified Data.Text as T
import qualified Data.Vector as V
import qualified Data.Vector.Generic as VG
import qualified Data.Vector.Unboxed as VU
import qualified Prelude

import Control.Exception (throw)
import Data.Function ((&))
import Data.Maybe (
    fromJust,
    fromMaybe,
    isJust,
    isNothing,
 )
import Data.Type.Equality (TestEquality (..))
import DataFrame.Errors (
    DataFrameException (..),
    TypeErrorContext (..),
 )
import DataFrame.Internal.Column
import DataFrame.Internal.DataFrame (
    DataFrame (..),
    derivingExpressions,
    empty,
    getColumn,
    unsafeGetColumn,
 )
import DataFrame.Internal.Expression
import DataFrame.Internal.Interpreter
import DataFrame.Operations.Core
import DataFrame.Operations.Merge ()
import DataFrame.Operations.Transformations (apply)
import System.Random
import Type.Reflection
import Prelude hiding (filter, take)

-- | Apply 'takeColumn' to every column with @n@ clipped to @[0, rows]@ and
-- record the clipped value as the new row count.
take :: Int -> DataFrame -> DataFrame
take n d =
    d
        { columns = V.map (takeColumn n') (columns d)
        , dataframeDimensions = (n', c)
        }
  where
    (r, c) = dataframeDimensions d
    n' = clip n 0 r

-- | Apply 'takeLastColumn' to every column with @n@ clipped to @[0, rows]@
-- and record the clipped value as the new row count.
takeLast :: Int -> DataFrame -> DataFrame
takeLast n d =
    d
        { columns = V.map (takeLastColumn n') (columns d)
        , dataframeDimensions = (n', c)
        }
  where
    (r, c) = dataframeDimensions d
    n' = clip n 0 r

-- | Skip the first @n@ rows (clipped to @[0, rows]@) by slicing every column
-- from offset @n'@ for the remaining @rows - n'@ entries.
drop :: Int -> DataFrame -> DataFrame
drop n d =
    d
        { columns = V.map (sliceColumn n' remaining) (columns d)
        , dataframeDimensions = (remaining, c)
        }
  where
    (r, c) = dataframeDimensions d
    n' = clip n 0 r
    remaining = max (r - n') 0

-- | Slice every column from offset 0 for @rows - n@ entries (clipped to
-- @[0, rows]@), i.e. everything except the trailing @n@ rows.
dropLast :: Int -> DataFrame -> DataFrame
dropLast n d =
    d
        { columns = V.map (sliceColumn 0 n') (columns d)
        , dataframeDimensions = (n', c)
        }
  where
    (r, c) = dataframeDimensions d
    n' = clip (r - n) 0 r

-- | Slice every column starting at @start@ (clipped to @[0, rows]@) for
-- @end - start@ entries.
--
-- The length is clipped to the number of rows that remain after the clipped
-- start offset, so the requested slice never extends past the end of the
-- columns (previously it was clipped to the total row count only).
range :: (Int, Int) -> DataFrame -> DataFrame
range (start, end) d =
    d
        { columns = V.map (sliceColumn start' n') (columns d)
        , dataframeDimensions = (n', c)
        }
  where
    (r, c) = dataframeDimensions d
    start' = clip start 0 r
    n' = clip (end - start) 0 (r - start')

-- | @clip n left right@ bounds @n@ below by @left@ and then above by @right@.
clip :: Int -> Int -> Int -> Int
clip n left right = min right (max n left)

-- | Keep the rows for which @condition@ holds on the values of the given
-- expression.
--
-- A bare column reference is looked up directly and throws
-- 'ColumnNotFoundException' when the column is missing; any other expression
-- is normalised and interpreted first, and interpreter errors are rethrown.
filter ::
    forall a.
    (Columnable a) =>
    Expr a ->
    (a -> Bool) ->
    DataFrame ->
    DataFrame
filter (Col filterColumnName) condition df =
    case getColumn filterColumnName df of
        Nothing ->
            throw $
                ColumnNotFoundException
                    filterColumnName
                    "filter"
                    (M.keys $ columnIndices df)
        Just (BoxedColumn (column :: V.Vector b)) ->
            filterByVector filterColumnName column condition df
        Just (OptionalColumn (column :: V.Vector b)) ->
            filterByVector filterColumnName column condition df
        Just (UnboxedColumn (column :: VU.Vector b)) ->
            filterByVector filterColumnName column condition df
filter expr condition df =
    let col = case interpret @a df (normalize expr) of
            Left e -> throw e
            Right (TColumn c) -> c
        indexes = either throw id (findIndices condition col)
     in rowsAtIndices indexes df

-- | Filter a dataframe using the raw vector backing one of its columns.
--
-- Throws 'TypeMismatchException' when the predicate's argument type @a@
-- differs from the vector's element type @b@.
filterByVector ::
    forall a b v.
    (VG.Vector v b, VG.Vector v Int, Columnable a, Columnable b) =>
    T.Text ->
    v b ->
    (a -> Bool) ->
    DataFrame ->
    DataFrame
filterByVector filterColumnName column condition df =
    case testEquality (typeRep @a) (typeRep @b) of
        Nothing ->
            throw $
                TypeMismatchException
                    ( MkTypeErrorContext
                        { userType = Right $ typeRep @a
                        , expectedType = Right $ typeRep @b
                        , errorColumnName = Just (T.unpack filterColumnName)
                        , callingFunctionName = Just "filter"
                        }
                    )
        Just Refl ->
            rowsAtIndices (VG.convert (VG.findIndices condition column)) df

-- | 'filter' with the predicate and the expression swapped.
filterBy :: (Columnable a) => (a -> Bool) -> Expr a -> DataFrame -> DataFrame
filterBy = flip filter

-- | Keep the rows for which the boolean expression evaluates to 'True'.
-- Interpreter errors are rethrown.
filterWhere :: Expr Bool -> DataFrame -> DataFrame
filterWhere expr df =
    let col = case interpret @Bool df (normalize expr) of
            Left e -> throw e
            Right (TColumn c) -> c
        indexes = either throw id (findIndices id col)
     in rowsAtIndices indexes df

-- | For an 'OptionalColumn', keep only the rows holding a 'Just' and then
-- unwrap the column with 'fromJust'. Any other kind of column leaves the
-- dataframe unchanged. Throws 'ColumnNotFoundException' for an unknown name.
filterJust :: T.Text -> DataFrame -> DataFrame
filterJust name df = case getColumn name df of
    Nothing ->
        throw $
            ColumnNotFoundException name "filterJust" (M.keys $ columnIndices df)
    -- The pattern signature is only there to bring the element type into scope.
    Just (OptionalColumn (_ :: V.Vector (Maybe a))) ->
        filter (Col @(Maybe a) name) isJust df
            -- fromJust cannot fail here: the filter above kept only Just rows.
            & apply @(Maybe a) fromJust name
    Just _ -> df

-- | For an 'OptionalColumn', keep only the rows holding 'Nothing'. Any other
-- kind of column leaves the dataframe unchanged. Throws
-- 'ColumnNotFoundException' for an unknown name.
filterNothing :: T.Text -> DataFrame -> DataFrame
filterNothing name df = case getColumn name df of
    Nothing ->
        throw $
            ColumnNotFoundException
                name
                "filterNothing"
                (M.keys $ columnIndices df)
    Just (OptionalColumn (_ :: V.Vector (Maybe a))) ->
        filter (Col @(Maybe a) name) isNothing df
    _ -> df

-- | Apply 'filterJust' for every column name of the dataframe.
filterAllJust :: DataFrame -> DataFrame
filterAllJust df = foldr filterJust df (columnNames df)

-- | Apply 'filterNothing' for every column name of the dataframe.
filterAllNothing :: DataFrame -> DataFrame
filterAllNothing df = foldr filterNothing df (columnNames df)

-- | Select the columns at indices @0 .. width - 1@ and then 'take' the given
-- number of rows.
cube :: (Int, Int) -> DataFrame -> DataFrame
cube (rowCount, colCount) =
    take rowCount . selectBy [ColumnIndexRange (0, colCount - 1)]

-- | Build a dataframe holding only the named columns, inserted in the order
-- given, keeping the deriving expressions that belong to those names.
--
-- An empty list yields 'empty'; any unknown name throws
-- 'ColumnNotFoundException' listing the names that were not found.
select :: [T.Text] -> DataFrame -> DataFrame
select cs df
    | L.null cs = empty
    | any (`notElem` columnNames df) cs =
        throw $
            ColumnNotFoundException
                (T.pack $ show $ cs L.\\ columnNames df)
                "select"
                (columnNames df)
    | otherwise =
        let result = L.foldl' addKeyValue empty cs
            filteredExprs =
                M.filterWithKey
                    (\k _ -> k `L.elem` cs)
                    (derivingExpressions df)
         in result{derivingExpressions = filteredExprs}
  where
    -- Falls back to the accumulator (not the full input) should a lookup
    -- fail; the guards above already ensure every name exists.
    addKeyValue :: DataFrame -> T.Text -> DataFrame
    addKeyValue d k =
        fromMaybe d $ fmap (\col -> insertColumn k col d) (getColumn k df)

-- | Ways of picking columns for 'selectBy'.
data SelectionCriteria
    = ColumnProperty (Column -> Bool)
    | ColumnNameProperty (T.Text -> Bool)
    | ColumnTextRange (T.Text, T.Text)
    | ColumnIndexRange (Int, Int)
    | ColumnName T.Text

-- | Criterion matching a single column name.
byName :: T.Text -> SelectionCriteria
byName = ColumnName

-- | Criterion matching columns whose contents satisfy a predicate.
byProperty :: (Column -> Bool) -> SelectionCriteria
byProperty = ColumnProperty

-- | Criterion matching columns whose name satisfies a predicate.
byNameProperty :: (T.Text -> Bool) -> SelectionCriteria
byNameProperty = ColumnNameProperty

-- | Criterion matching the columns between two names (both inclusive), in
-- 'columnNames' order.
byNameRange :: (T.Text, T.Text) -> SelectionCriteria
byNameRange = ColumnTextRange

-- | Criterion matching the columns between two positions (both inclusive),
-- in 'columnNames' order.
byIndexRange :: (Int, Int) -> SelectionCriteria
byIndexRange = ColumnIndexRange

-- | Select every column matched by at least one criterion. The result keeps
-- the order of 'columnNames', regardless of the order of the criteria.
selectBy :: [SelectionCriteria] -> DataFrame -> DataFrame
selectBy xs df = select finalSelection df
  where
    allNames :: [T.Text]
    allNames = columnNames df

    finalSelection :: [T.Text]
    finalSelection =
        Prelude.filter (`S.member` columnsWithProperties) allNames

    columnsWithProperties :: S.Set T.Text
    columnsWithProperties = S.fromList (concatMap namesMatching xs)

    namesMatching :: SelectionCriteria -> [T.Text]
    namesMatching (ColumnName name) = [name]
    namesMatching (ColumnNameProperty f) = L.filter f allNames
    namesMatching (ColumnTextRange (from, to)) =
        -- Trim everything before @from@, then everything after @to@.
        reverse
            ( Prelude.dropWhile (to /=) $
                reverse $
                    dropWhile (from /=) allNames
            )
    namesMatching (ColumnIndexRange (from, to)) =
        Prelude.take (to - from + 1) (Prelude.drop from allNames)
    namesMatching (ColumnProperty f) =
        [ name
        | (name, ix) <- M.toAscList (columnIndices df)
        , maybe False f (columns df V.!? ix)
        ]

-- | Keep every column except the named ones.
exclude :: [T.Text] -> DataFrame -> DataFrame
exclude cs df =
    let keysToKeep = columnNames df L.\\ cs
     in select keysToKeep df

-- | Random row sample: one value is drawn per row with
-- 'generateRandomVector' and the row is kept when that value is @>= 1 - p@.
--
-- A temporary @__rand__@ column is inserted for the comparison and removed
-- again before returning.
sample :: (RandomGen g) => g -> Double -> DataFrame -> DataFrame
sample pureGen p df =
    let rand = generateRandomVector pureGen (fst (dataframeDimensions df))
     in df
            & insertUnboxedVector "__rand__" rand
            & filterWhere
                ( Binary
                    ( MkBinaryOp
                        { binaryFn = (>=)
                        , binaryName = "geq"
                        , binarySymbol = Just ">="
                        , binaryCommutative = False
                        , binaryPrecedence = 1
                        }
                    )
                    (Col @Double "__rand__")
                    (Lit (1 - p))
                )
            & exclude ["__rand__"]

-- | Split the rows in two using one random draw per row: the first frame
-- holds the rows whose draw is @<= p@, the second the rows whose draw is
-- @> p@. The temporary @__rand__@ column is removed from both results.
randomSplit ::
    (RandomGen g) => g -> Double -> DataFrame -> (DataFrame, DataFrame)
randomSplit pureGen p df =
    let rand = generateRandomVector pureGen (fst (dataframeDimensions df))
        withRand = df & insertUnboxedVector "__rand__" rand
     in ( withRand
            & filterWhere
                ( Binary
                    ( MkBinaryOp
                        { binaryFn = (<=)
                        , binaryName = "leq"
                        , binarySymbol = Just "<="
                        , binaryCommutative = False
                        , binaryPrecedence = 1
                        }
                    )
                    (Col @Double "__rand__")
                    (Lit p)
                )
            & exclude ["__rand__"]
        , withRand
            & filterWhere
                ( Binary
                    ( MkBinaryOp
                        { binaryFn = (>)
                        , binaryName = "gt"
                        , binarySymbol = Just ">"
                        , binaryCommutative = False
                        , binaryPrecedence = 1
                        }
                    )
                    (Col @Double "__rand__")
                    (Lit p)
                )
            & exclude ["__rand__"]
        )

-- | Partition the rows into @folds@ frames using one random draw per row.
--
-- Working down from @n = folds - 1@ to @0@, each fold takes the remaining
-- rows whose draw is @>= n / folds@; the rows below that threshold are passed
-- on to the next step. A non-positive @folds@ yields an empty list.
kFolds :: (RandomGen g) => g -> Int -> DataFrame -> [DataFrame]
kFolds pureGen folds df =
    let rand = generateRandomVector pureGen (fst (dataframeDimensions df))
        withRand = df & insertUnboxedVector "__rand__" rand
        partitionSize = 1 / fromIntegral folds

        singleFold :: Int -> DataFrame -> DataFrame
        singleFold n d =
            d
                & filterWhere
                    ( Binary
                        ( MkBinaryOp
                            { binaryFn = (>=)
                            , binaryName = "geq"
                            , binarySymbol = Just ">="
                            , binaryCommutative = False
                            , binaryPrecedence = 1
                            }
                        )
                        (Col @Double "__rand__")
                        (Lit (fromIntegral n * partitionSize))
                    )

        go :: Int -> DataFrame -> [DataFrame]
        go n d
            -- Stop on any negative index so a negative @folds@ terminates.
            | n < 0 = []
            | otherwise =
                let d' = singleFold n d
                    d'' =
                        d
                            & filterWhere
                                ( Binary
                                    ( MkBinaryOp
                                        { binaryFn = (<)
                                        , binaryName = "lt"
                                        , binarySymbol = Just "<"
                                        , binaryCommutative = False
                                        , binaryPrecedence = 1
                                        }
                                    )
                                    (Col @Double "__rand__")
                                    (Lit (fromIntegral n * partitionSize))
                                )
                 in d' : go (n - 1) d''
     in map (exclude ["__rand__"]) (go (folds - 1) withRand)

-- | Draw @k@ values with @uniformR (0, 1)@, threading the generator from one
-- draw to the next. A non-positive @k@ yields an empty vector.
generateRandomVector :: (RandomGen g) => g -> Int -> VU.Vector Double
generateRandomVector pureGen k =
    VU.unfoldrN k (Just . uniformR (0 :: Double, 1 :: Double)) pureGen

-- | Render a column as text: boxed 'T.Text' columns are returned as they
-- are, everything else goes through 'show'.
columnToTextVec :: Column -> V.Vector T.Text
columnToTextVec (BoxedColumn (col :: V.Vector a)) =
    case testEquality (typeRep @a) (typeRep @T.Text) of
        Just Refl -> col
        Nothing -> V.map (T.pack . show) col
columnToTextVec (UnboxedColumn col) = V.map (T.pack . show) (V.convert col)
columnToTextVec (OptionalColumn col) = V.map (T.pack . show) col

-- | Map each distinct textual value of a column (see 'columnToTextVec') to
-- the ascending row indices at which it occurs.
groupByIndices :: Column -> M.Map T.Text (VU.Vector Int)
groupByIndices col = M.map (VU.fromList . L.reverse) grouped
  where
    -- Indices are prepended while folding, hence the reversal above.
    grouped :: M.Map T.Text [Int]
    grouped =
        V.ifoldl'
            (\m i key -> M.insertWith (++) key [i] m)
            M.empty
            (columnToTextVec col)

-- | Pick the rows at the given indices from every column (via
-- 'atIndicesStable') and set the row count to the number of indices.
rowsAtIndices :: VU.Vector Int -> DataFrame -> DataFrame
rowsAtIndices ixs df =
    df
        { columns = V.map (atIndicesStable ixs) (columns df)
        , dataframeDimensions = (VU.length ixs, snd (dataframeDimensions df))
        }

-- NOTE(review): 'stratifiedSample' runs past the end of the visible text; the remainder of this chunk is carried over exactly as found so that it still joins up with whatever follows.
stratifiedSample :: forall a g. (SplitGen g, RandomGen g, Columnable a) => g -> Double -> Expr a -> DataFrame -> DataFrame stratifiedSample :: forall a g. (SplitGen g, RandomGen g, Columnable a) => g -> Double -> Expr a -> DataFrame -> DataFrame stratifiedSample g gen Double p Expr a strataCol DataFrame df = let col :: Column col = case Expr a strataCol of Col Text name -> Text -> DataFrame -> Column unsafeGetColumn Text name DataFrame df Expr a _ -> TypedColumn a -> Column forall a. TypedColumn a -> Column unwrapTypedColumn ((DataFrameException -> TypedColumn a) -> (TypedColumn a -> TypedColumn a) -> Either DataFrameException (TypedColumn a) -> TypedColumn a forall a c b. (a -> c) -> (b -> c) -> Either a b -> c either DataFrameException -> TypedColumn a forall a e. Exception e => e -> a throw TypedColumn a -> TypedColumn a forall a. a -> a id (forall a. Columnable a => DataFrame -> Expr a -> Either DataFrameException (TypedColumn a) interpret @a DataFrame df Expr a strataCol)) groups :: [Vector Int] groups = Map Text (Vector Int) -> [Vector Int] forall k a. Map k a -> [a] M.elems (Column -> Map Text (Vector Int) groupByIndices Column col) go :: g -> [Vector Int] -> DataFrame go g _ [] = DataFrame forall a. Monoid a => a mempty go g g (Vector Int ixs : [Vector Int] rest) = let stratum :: DataFrame stratum = Vector Int -> DataFrame -> DataFrame rowsAtIndices Vector Int ixs DataFrame df (g g1, g g2) = g -> (g, g) forall g. SplitGen g => g -> (g, g) splitGen g g in g -> Double -> DataFrame -> DataFrame forall g.
RandomGen g => g -> Double -> DataFrame -> DataFrame sample g g1 Double p DataFrame stratum DataFrame -> DataFrame -> DataFrame forall a. Semigroup a => a -> a -> a <> g -> [Vector Int] -> DataFrame go g g2 [Vector Int] rest in g -> [Vector Int] -> DataFrame go g gen [Vector Int] groups stratifiedSplit :: forall a g. (SplitGen g, RandomGen g, Columnable a) => g -> Double -> Expr a -> DataFrame -> (DataFrame, DataFrame) stratifiedSplit :: forall a g. (SplitGen g, RandomGen g, Columnable a) => g -> Double -> Expr a -> DataFrame -> (DataFrame, DataFrame) stratifiedSplit g gen Double p Expr a strataCol DataFrame df = let col :: Column col = case Expr a strataCol of Col Text name -> Text -> DataFrame -> Column unsafeGetColumn Text name DataFrame df Expr a _ -> TypedColumn a -> Column forall a. TypedColumn a -> Column unwrapTypedColumn ((DataFrameException -> TypedColumn a) -> (TypedColumn a -> TypedColumn a) -> Either DataFrameException (TypedColumn a) -> TypedColumn a forall a c b. (a -> c) -> (b -> c) -> Either a b -> c either DataFrameException -> TypedColumn a forall a e. Exception e => e -> a throw TypedColumn a -> TypedColumn a forall a. a -> a id (forall a. Columnable a => DataFrame -> Expr a -> Either DataFrameException (TypedColumn a) interpret @a DataFrame df Expr a strataCol)) groups :: [Vector Int] groups = Map Text (Vector Int) -> [Vector Int] forall k a. Map k a -> [a] M.elems (Column -> Map Text (Vector Int) groupByIndices Column col) go :: g -> [Vector Int] -> (DataFrame, DataFrame) go g _ [] = (DataFrame forall a. Monoid a => a mempty, DataFrame forall a. Monoid a => a mempty) go g g (Vector Int ixs : [Vector Int] rest) = let stratum :: DataFrame stratum = Vector Int -> DataFrame -> DataFrame rowsAtIndices Vector Int ixs DataFrame df (g g1, g g2) = g -> (g, g) forall g. SplitGen g => g -> (g, g) splitGen g g (DataFrame tr, DataFrame va) = g -> Double -> DataFrame -> (DataFrame, DataFrame) forall g. 
RandomGen g => g -> Double -> DataFrame -> (DataFrame, DataFrame) randomSplit g g1 Double p DataFrame stratum (DataFrame trAcc, DataFrame vaAcc) = g -> [Vector Int] -> (DataFrame, DataFrame) go g g2 [Vector Int] rest in (DataFrame tr DataFrame -> DataFrame -> DataFrame forall a. Semigroup a => a -> a -> a <> DataFrame trAcc, DataFrame va DataFrame -> DataFrame -> DataFrame forall a. Semigroup a => a -> a -> a <> DataFrame vaAcc) in g -> [Vector Int] -> (DataFrame, DataFrame) go g gen [Vector Int] groups