{-# LANGUAGE ExplicitNamespaces #-} {-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE FlexibleInstances #-} {-# LANGUAGE GADTs #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE RankNTypes #-} {-# LANGUAGE ScopedTypeVariables #-} {-# LANGUAGE TypeApplications #-} {-# LANGUAGE UndecidableInstances #-} module DataFrame.Operations.Statistics where import qualified Data.List as L import qualified Data.Map as M import qualified Data.Text as T import qualified Data.Vector as V import qualified Data.Vector.Generic as VG import qualified Data.Vector.Unboxed as VU import Prelude as P import Control.Exception (throw) import Data.Function ((&)) import Data.Maybe (fromMaybe, isJust) import Data.Type.Equality (TestEquality (testEquality), type (:~:) (Refl)) import DataFrame.Errors (DataFrameException (..)) import DataFrame.Internal.Column import DataFrame.Internal.DataFrame ( DataFrame (..), empty, getColumn, ) import DataFrame.Internal.Expression import DataFrame.Internal.Interpreter import DataFrame.Internal.Nullable (BaseType) import DataFrame.Internal.Row (showValue, toAny) import DataFrame.Internal.Statistics import DataFrame.Internal.Types import DataFrame.Operations.Core import DataFrame.Operations.Subset (filterJust) import DataFrame.Operations.Transformations (ImputeOp (..), imputeCore) import Text.Printf (printf) import Type.Reflection (typeRep) frequencies :: forall a. (Columnable a) => Expr a -> DataFrame -> DataFrame frequencies :: forall a. Columnable a => Expr a -> DataFrame -> DataFrame frequencies Expr a expr DataFrame df = let counts :: [(a, Int)] counts = Expr a -> DataFrame -> [(a, Int)] forall a. (Ord a, Columnable a) => Expr a -> DataFrame -> [(a, Int)] valueCounts Expr a expr DataFrame df calculatePercentage :: [(a, a)] -> a -> Any calculatePercentage [(a, a)] cs a k = String -> Any forall a. Columnable a => a -> Any toAny (String -> Any) -> String -> Any forall a b. (a -> b) -> a -> b $ Double -> String toPct2dp (a -> Double forall a b. 
(Integral a, Num b) => a -> b fromIntegral a k Double -> Double -> Double forall a. Fractional a => a -> a -> a / a -> Double forall a b. (Integral a, Num b) => a -> b fromIntegral ([a] -> a forall a. Num a => [a] -> a forall (t :: * -> *) a. (Foldable t, Num a) => t a -> a P.sum ([a] -> a) -> [a] -> a forall a b. (a -> b) -> a -> b $ ((a, a) -> a) -> [(a, a)] -> [a] forall a b. (a -> b) -> [a] -> [b] map (a, a) -> a forall a b. (a, b) -> b snd [(a, a)] cs)) initDf :: DataFrame initDf = DataFrame empty DataFrame -> (DataFrame -> DataFrame) -> DataFrame forall a b. a -> (a -> b) -> b & Text -> Vector Text -> DataFrame -> DataFrame forall a. Columnable a => Text -> Vector a -> DataFrame -> DataFrame insertVector Text "Statistic" ([Text] -> Vector Text forall a. [a] -> Vector a V.fromList [Text "Count" :: T.Text, Text "Percentage (%)"]) freqs :: Vector a -> DataFrame freqs Vector a col = (DataFrame -> (a, Int) -> DataFrame) -> DataFrame -> [(a, Int)] -> DataFrame forall b a. (b -> a -> b) -> b -> [a] -> b forall (t :: * -> *) b a. Foldable t => (b -> a -> b) -> b -> t a -> b L.foldl' ( \DataFrame d (a col, Int k) -> Text -> Vector Any -> DataFrame -> DataFrame forall a. Columnable a => Text -> Vector a -> DataFrame -> DataFrame insertVector (forall a. Columnable a => a -> Text showValue @a a col) ([Any] -> Vector Any forall a. [a] -> Vector a V.fromList [Int -> Any forall a. Columnable a => a -> Any toAny Int k, [(a, Int)] -> Int -> Any forall {a} {a} {a}. (Integral a, Integral a) => [(a, a)] -> a -> Any calculatePercentage [(a, Int)] counts Int k]) DataFrame d ) DataFrame initDf [(a, Int)] counts in case Expr a -> DataFrame -> Either DataFrameException (Vector a) forall a. Columnable a => Expr a -> DataFrame -> Either DataFrameException (Vector a) columnAsVector Expr a expr DataFrame df of Left DataFrameException err -> DataFrameException -> DataFrame forall a e. 
Exception e => e -> a throw DataFrameException err Right Vector a column -> Vector a -> DataFrame freqs Vector a column mean :: forall a. (Columnable a, Real a, VU.Unbox a) => Expr a -> DataFrame -> Double mean :: forall a. (Columnable a, Real a, Unbox a) => Expr a -> DataFrame -> Double mean (Col Text name) DataFrame df = case Text -> DataFrame -> Maybe (Vector Double) _getColumnAsDouble Text name DataFrame df of Just Vector Double xs -> Vector Double -> Double meanDouble' Vector Double xs Maybe (Vector Double) Nothing -> String -> Double forall a. HasCallStack => String -> a error String "[INTERNAL ERROR] Column is non-numeric" mean Expr a expr DataFrame df = case DataFrame -> Expr a -> Either DataFrameException (TypedColumn a) forall a. Columnable a => DataFrame -> Expr a -> Either DataFrameException (TypedColumn a) interpret DataFrame df Expr a expr of Left DataFrameException e -> DataFrameException -> Double forall a e. Exception e => e -> a throw DataFrameException e Right (TColumn Column col) -> case forall a. (Columnable a, Unbox a) => Column -> Either DataFrameException (Vector a) toUnboxedVector @a Column col of Left DataFrameException e -> DataFrameException -> Double forall a e. Exception e => e -> a throw DataFrameException e Right Vector a xs -> Vector a -> Double forall a. (Real a, Unbox a) => Vector a -> Double mean' Vector a xs meanMaybe :: forall a. (Columnable a, Real a) => Expr (Maybe a) -> DataFrame -> Double meanMaybe :: forall a. (Columnable a, Real a) => Expr (Maybe a) -> DataFrame -> Double meanMaybe (Col Text name) DataFrame df = (Vector Double -> Double forall a. (Real a, Unbox a) => Vector a -> Double mean' (Vector Double -> Double) -> (Vector (Maybe a) -> Vector Double) -> Vector (Maybe a) -> Double forall b c a. (b -> c) -> (a -> b) -> a -> c . Vector (Maybe a) -> Vector Double forall a. 
Real a => Vector (Maybe a) -> Vector Double optionalToDoubleVector) ((DataFrameException -> Vector (Maybe a)) -> (Vector (Maybe a) -> Vector (Maybe a)) -> Either DataFrameException (Vector (Maybe a)) -> Vector (Maybe a) forall a c b. (a -> c) -> (b -> c) -> Either a b -> c either DataFrameException -> Vector (Maybe a) forall a e. Exception e => e -> a throw Vector (Maybe a) -> Vector (Maybe a) forall a. a -> a id (Expr (Maybe a) -> DataFrame -> Either DataFrameException (Vector (Maybe a)) forall a. Columnable a => Expr a -> DataFrame -> Either DataFrameException (Vector a) columnAsVector (forall a. Columnable a => Text -> Expr a Col @(Maybe a) Text name) DataFrame df)) meanMaybe Expr (Maybe a) expr DataFrame df = case forall a. Columnable a => DataFrame -> Expr a -> Either DataFrameException (TypedColumn a) interpret @(Maybe a) DataFrame df Expr (Maybe a) expr of Left DataFrameException e -> DataFrameException -> Double forall a e. Exception e => e -> a throw DataFrameException e Right (TColumn Column col) -> case forall a (v :: * -> *). (Vector v a, Columnable a) => Column -> Either DataFrameException (v a) toVector @(Maybe a) Column col of Left DataFrameException e -> DataFrameException -> Double forall a e. Exception e => e -> a throw DataFrameException e Right Vector (Maybe a) xs -> (Vector Double -> Double forall a. (Real a, Unbox a) => Vector a -> Double mean' (Vector Double -> Double) -> (Vector (Maybe a) -> Vector Double) -> Vector (Maybe a) -> Double forall b c a. (b -> c) -> (a -> b) -> a -> c . Vector (Maybe a) -> Vector Double forall a. Real a => Vector (Maybe a) -> Vector Double optionalToDoubleVector) Vector (Maybe a) xs median :: forall a. (Columnable a, Real a, VU.Unbox a) => Expr a -> DataFrame -> Double median :: forall a. (Columnable a, Real a, Unbox a) => Expr a -> DataFrame -> Double median (Col Text name) DataFrame df = case Expr a -> DataFrame -> Either DataFrameException (Vector a) forall a. 
(Columnable a, Unbox a) => Expr a -> DataFrame -> Either DataFrameException (Vector a) columnAsUnboxedVector (forall a. Columnable a => Text -> Expr a Col @a Text name) DataFrame df of Right Vector a xs -> Vector a -> Double forall a. (Real a, Unbox a) => Vector a -> Double median' Vector a xs Left DataFrameException e -> DataFrameException -> Double forall a e. Exception e => e -> a throw DataFrameException e median Expr a expr DataFrame df = case DataFrame -> Expr a -> Either DataFrameException (TypedColumn a) forall a. Columnable a => DataFrame -> Expr a -> Either DataFrameException (TypedColumn a) interpret DataFrame df Expr a expr of Left DataFrameException e -> DataFrameException -> Double forall a e. Exception e => e -> a throw DataFrameException e Right (TColumn Column col) -> case forall a. (Columnable a, Unbox a) => Column -> Either DataFrameException (Vector a) toUnboxedVector @a Column col of Left DataFrameException e -> DataFrameException -> Double forall a e. Exception e => e -> a throw DataFrameException e Right Vector a xs -> Vector a -> Double forall a. (Real a, Unbox a) => Vector a -> Double median' Vector a xs medianMaybe :: forall a. (Columnable a, Real a) => Expr (Maybe a) -> DataFrame -> Double medianMaybe :: forall a. (Columnable a, Real a) => Expr (Maybe a) -> DataFrame -> Double medianMaybe (Col Text name) DataFrame df = (Vector Double -> Double forall a. (Real a, Unbox a) => Vector a -> Double median' (Vector Double -> Double) -> (Vector (Maybe a) -> Vector Double) -> Vector (Maybe a) -> Double forall b c a. (b -> c) -> (a -> b) -> a -> c . Vector (Maybe a) -> Vector Double forall a. Real a => Vector (Maybe a) -> Vector Double optionalToDoubleVector) ((DataFrameException -> Vector (Maybe a)) -> (Vector (Maybe a) -> Vector (Maybe a)) -> Either DataFrameException (Vector (Maybe a)) -> Vector (Maybe a) forall a c b. (a -> c) -> (b -> c) -> Either a b -> c either DataFrameException -> Vector (Maybe a) forall a e. 
Exception e => e -> a throw Vector (Maybe a) -> Vector (Maybe a) forall a. a -> a id (Expr (Maybe a) -> DataFrame -> Either DataFrameException (Vector (Maybe a)) forall a. Columnable a => Expr a -> DataFrame -> Either DataFrameException (Vector a) columnAsVector (forall a. Columnable a => Text -> Expr a Col @(Maybe a) Text name) DataFrame df)) medianMaybe Expr (Maybe a) expr DataFrame df = case forall a. Columnable a => DataFrame -> Expr a -> Either DataFrameException (TypedColumn a) interpret @(Maybe a) DataFrame df Expr (Maybe a) expr of Left DataFrameException e -> DataFrameException -> Double forall a e. Exception e => e -> a throw DataFrameException e Right (TColumn Column col) -> case forall a (v :: * -> *). (Vector v a, Columnable a) => Column -> Either DataFrameException (v a) toVector @(Maybe a) Column col of Left DataFrameException e -> DataFrameException -> Double forall a e. Exception e => e -> a throw DataFrameException e Right Vector (Maybe a) xs -> (Vector Double -> Double forall a. (Real a, Unbox a) => Vector a -> Double median' (Vector Double -> Double) -> (Vector (Maybe a) -> Vector Double) -> Vector (Maybe a) -> Double forall b c a. (b -> c) -> (a -> b) -> a -> c . Vector (Maybe a) -> Vector Double forall a. Real a => Vector (Maybe a) -> Vector Double optionalToDoubleVector) Vector (Maybe a) xs percentile :: forall a. (Columnable a, Real a, VU.Unbox a) => Int -> Expr a -> DataFrame -> Double percentile :: forall a. (Columnable a, Real a, Unbox a) => Int -> Expr a -> DataFrame -> Double percentile Int n (Col Text name) DataFrame df = case Expr a -> DataFrame -> Either DataFrameException (Vector a) forall a. (Columnable a, Unbox a) => Expr a -> DataFrame -> Either DataFrameException (Vector a) columnAsUnboxedVector (forall a. Columnable a => Text -> Expr a Col @a Text name) DataFrame df of Right Vector a xs -> Int -> Vector a -> Double forall a. 
(Unbox a, Num a, Real a) => Int -> Vector a -> Double percentile' Int n Vector a xs Left DataFrameException e -> DataFrameException -> Double forall a e. Exception e => e -> a throw DataFrameException e percentile Int n Expr a expr DataFrame df = case DataFrame -> Expr a -> Either DataFrameException (TypedColumn a) forall a. Columnable a => DataFrame -> Expr a -> Either DataFrameException (TypedColumn a) interpret DataFrame df Expr a expr of Left DataFrameException e -> DataFrameException -> Double forall a e. Exception e => e -> a throw DataFrameException e Right (TColumn Column col) -> case forall a. (Columnable a, Unbox a) => Column -> Either DataFrameException (Vector a) toUnboxedVector @a Column col of Left DataFrameException e -> DataFrameException -> Double forall a e. Exception e => e -> a throw DataFrameException e Right Vector a xs -> Int -> Vector a -> Double forall a. (Unbox a, Num a, Real a) => Int -> Vector a -> Double percentile' Int n Vector a xs genericPercentile :: forall a. (Columnable a, Ord a) => Int -> Expr a -> DataFrame -> a genericPercentile :: forall a. (Columnable a, Ord a) => Int -> Expr a -> DataFrame -> a genericPercentile Int n (Col Text name) DataFrame df = case Expr a -> DataFrame -> Either DataFrameException (Vector a) forall a. Columnable a => Expr a -> DataFrame -> Either DataFrameException (Vector a) columnAsVector (forall a. Columnable a => Text -> Expr a Col @a Text name) DataFrame df of Right Vector a xs -> Int -> Vector a -> a forall a. (Ord a, Eq a) => Int -> Vector a -> a percentileOrd' Int n Vector a xs Left DataFrameException e -> DataFrameException -> a forall a e. Exception e => e -> a throw DataFrameException e genericPercentile Int n Expr a expr DataFrame df = case DataFrame -> Expr a -> Either DataFrameException (TypedColumn a) forall a. 
Columnable a => DataFrame -> Expr a -> Either DataFrameException (TypedColumn a) interpret DataFrame df Expr a expr of Left DataFrameException e -> DataFrameException -> a forall a e. Exception e => e -> a throw DataFrameException e Right (TColumn Column col) -> case forall a (v :: * -> *). (Vector v a, Columnable a) => Column -> Either DataFrameException (v a) toVector @a Column col of Left DataFrameException e -> DataFrameException -> a forall a e. Exception e => e -> a throw DataFrameException e Right Vector a xs -> Int -> Vector a -> a forall a. (Ord a, Eq a) => Int -> Vector a -> a percentileOrd' Int n Vector a xs standardDeviation :: forall a. (Columnable a, Real a, VU.Unbox a) => Expr a -> DataFrame -> Double standardDeviation :: forall a. (Columnable a, Real a, Unbox a) => Expr a -> DataFrame -> Double standardDeviation (Col Text name) DataFrame df = case Expr a -> DataFrame -> Either DataFrameException (Vector a) forall a. (Columnable a, Unbox a) => Expr a -> DataFrame -> Either DataFrameException (Vector a) columnAsUnboxedVector (forall a. Columnable a => Text -> Expr a Col @a Text name) DataFrame df of Right Vector a xs -> (Double -> Double forall a. Floating a => a -> a sqrt (Double -> Double) -> (Vector a -> Double) -> Vector a -> Double forall b c a. (b -> c) -> (a -> b) -> a -> c . Vector a -> Double forall a. (Real a, Unbox a) => Vector a -> Double variance') Vector a xs Left DataFrameException e -> DataFrameException -> Double forall a e. Exception e => e -> a throw DataFrameException e standardDeviation Expr a expr DataFrame df = case DataFrame -> Expr a -> Either DataFrameException (TypedColumn a) forall a. Columnable a => DataFrame -> Expr a -> Either DataFrameException (TypedColumn a) interpret DataFrame df Expr a expr of Left DataFrameException e -> DataFrameException -> Double forall a e. Exception e => e -> a throw DataFrameException e Right (TColumn Column col) -> case forall a. 
(Columnable a, Unbox a) => Column -> Either DataFrameException (Vector a) toUnboxedVector @a Column col of Left DataFrameException e -> DataFrameException -> Double forall a e. Exception e => e -> a throw DataFrameException e Right Vector a xs -> (Double -> Double forall a. Floating a => a -> a sqrt (Double -> Double) -> (Vector a -> Double) -> Vector a -> Double forall b c a. (b -> c) -> (a -> b) -> a -> c . Vector a -> Double forall a. (Real a, Unbox a) => Vector a -> Double variance') Vector a xs skewness :: forall a. (Columnable a, Real a, VU.Unbox a) => Expr a -> DataFrame -> Double skewness :: forall a. (Columnable a, Real a, Unbox a) => Expr a -> DataFrame -> Double skewness (Col Text name) DataFrame df = case Expr a -> DataFrame -> Either DataFrameException (Vector a) forall a. (Columnable a, Unbox a) => Expr a -> DataFrame -> Either DataFrameException (Vector a) columnAsUnboxedVector (forall a. Columnable a => Text -> Expr a Col @a Text name) DataFrame df of Right Vector a xs -> Vector a -> Double forall a. (Unbox a, Real a, Num a) => Vector a -> Double skewness' Vector a xs Left DataFrameException e -> DataFrameException -> Double forall a e. Exception e => e -> a throw DataFrameException e skewness Expr a expr DataFrame df = case DataFrame -> Expr a -> Either DataFrameException (TypedColumn a) forall a. Columnable a => DataFrame -> Expr a -> Either DataFrameException (TypedColumn a) interpret DataFrame df Expr a expr of Left DataFrameException e -> DataFrameException -> Double forall a e. Exception e => e -> a throw DataFrameException e Right (TColumn Column col) -> case forall a. (Columnable a, Unbox a) => Column -> Either DataFrameException (Vector a) toUnboxedVector @a Column col of Left DataFrameException e -> DataFrameException -> Double forall a e. Exception e => e -> a throw DataFrameException e Right Vector a xs -> Vector a -> Double forall a. (Unbox a, Real a, Num a) => Vector a -> Double skewness' Vector a xs variance :: forall a. 
(Columnable a, Real a, VU.Unbox a) => Expr a -> DataFrame -> Double variance :: forall a. (Columnable a, Real a, Unbox a) => Expr a -> DataFrame -> Double variance (Col Text name) DataFrame df = case Text -> DataFrame -> Maybe (Vector Double) _getColumnAsDouble Text name DataFrame df of Just Vector Double xs -> Vector Double -> Double varianceDouble' Vector Double xs Maybe (Vector Double) Nothing -> String -> Double forall a. HasCallStack => String -> a error String "[INTERNAL ERROR] Column is non-numeric" variance Expr a expr DataFrame df = case DataFrame -> Expr a -> Either DataFrameException (TypedColumn a) forall a. Columnable a => DataFrame -> Expr a -> Either DataFrameException (TypedColumn a) interpret DataFrame df Expr a expr of Left DataFrameException e -> DataFrameException -> Double forall a e. Exception e => e -> a throw DataFrameException e Right (TColumn Column col) -> case forall a. (Columnable a, Unbox a) => Column -> Either DataFrameException (Vector a) toUnboxedVector @a Column col of Left DataFrameException e -> DataFrameException -> Double forall a e. Exception e => e -> a throw DataFrameException e Right Vector a xs -> Vector a -> Double forall a. (Real a, Unbox a) => Vector a -> Double variance' Vector a xs interQuartileRange :: forall a. (Columnable a, Real a, VU.Unbox a) => Expr a -> DataFrame -> Double interQuartileRange :: forall a. (Columnable a, Real a, Unbox a) => Expr a -> DataFrame -> Double interQuartileRange (Col Text name) DataFrame df = case Expr a -> DataFrame -> Either DataFrameException (Vector a) forall a. (Columnable a, Unbox a) => Expr a -> DataFrame -> Either DataFrameException (Vector a) columnAsUnboxedVector (forall a. Columnable a => Text -> Expr a Col @a Text name) DataFrame df of Right Vector a xs -> Vector a -> Double forall a. (Unbox a, Num a, Real a) => Vector a -> Double interQuartileRange' Vector a xs Left DataFrameException e -> DataFrameException -> Double forall a e. 
Exception e => e -> a throw DataFrameException e interQuartileRange Expr a expr DataFrame df = case DataFrame -> Expr a -> Either DataFrameException (TypedColumn a) forall a. Columnable a => DataFrame -> Expr a -> Either DataFrameException (TypedColumn a) interpret DataFrame df Expr a expr of Left DataFrameException e -> DataFrameException -> Double forall a e. Exception e => e -> a throw DataFrameException e Right (TColumn Column col) -> case forall a. (Columnable a, Unbox a) => Column -> Either DataFrameException (Vector a) toUnboxedVector @a Column col of Left DataFrameException e -> DataFrameException -> Double forall a e. Exception e => e -> a throw DataFrameException e Right Vector a xs -> Vector a -> Double forall a. (Unbox a, Num a, Real a) => Vector a -> Double interQuartileRange' Vector a xs correlation :: T.Text -> T.Text -> DataFrame -> Maybe Double correlation :: Text -> Text -> DataFrame -> Maybe Double correlation Text first Text second DataFrame df = do Vector Double f <- Text -> DataFrame -> Maybe (Vector Double) _getColumnAsDouble Text first DataFrame df Vector Double s <- Text -> DataFrame -> Maybe (Vector Double) _getColumnAsDouble Text second DataFrame df Vector Double -> Vector Double -> Maybe Double correlation' Vector Double f Vector Double s _getColumnAsDouble :: T.Text -> DataFrame -> Maybe (VU.Vector Double) _getColumnAsDouble :: Text -> DataFrame -> Maybe (Vector Double) _getColumnAsDouble Text name DataFrame df = case Text -> DataFrame -> Maybe Column getColumn Text name DataFrame df of Just (UnboxedColumn (Vector a f :: VU.Vector a)) -> case TypeRep a -> TypeRep Double -> Maybe (a :~: Double) forall a b. TypeRep a -> TypeRep b -> Maybe (a :~: b) forall {k} (f :: k -> *) (a :: k) (b :: k). TestEquality f => f a -> f b -> Maybe (a :~: b) testEquality (forall a. Typeable a => TypeRep a forall {k} (a :: k). Typeable a => TypeRep a typeRep @a) (forall a. Typeable a => TypeRep a forall {k} (a :: k). 
Typeable a => TypeRep a typeRep @Double) of Just a :~: Double Refl -> Vector Double -> Maybe (Vector Double) forall a. a -> Maybe a Just Vector a Vector Double f Maybe (a :~: Double) Nothing -> case forall a. SBoolI (IntegralTypes a) => SBool (IntegralTypes a) sIntegral @a of SBool (IntegralTypes a) STrue -> Vector Double -> Maybe (Vector Double) forall a. a -> Maybe a Just ((a -> Double) -> Vector a -> Vector Double forall a b. (Unbox a, Unbox b) => (a -> b) -> Vector a -> Vector b VU.map a -> Double forall a b. (Integral a, Num b) => a -> b fromIntegral Vector a f) SBool (IntegralTypes a) SFalse -> case forall a. SBoolI (FloatingTypes a) => SBool (FloatingTypes a) sFloating @a of SBool (FloatingTypes a) STrue -> Vector Double -> Maybe (Vector Double) forall a. a -> Maybe a Just ((a -> Double) -> Vector a -> Vector Double forall a b. (Unbox a, Unbox b) => (a -> b) -> Vector a -> Vector b VU.map a -> Double forall a b. (Real a, Fractional b) => a -> b realToFrac Vector a f) SBool (FloatingTypes a) SFalse -> Maybe (Vector Double) forall a. Maybe a Nothing Maybe Column Nothing -> DataFrameException -> Maybe (Vector Double) forall a e. Exception e => e -> a throw (DataFrameException -> Maybe (Vector Double)) -> DataFrameException -> Maybe (Vector Double) forall a b. (a -> b) -> a -> b $ Text -> Text -> [Text] -> DataFrameException ColumnNotFoundException Text name Text "_getColumnAsDouble" (Map Text Int -> [Text] forall k a. Map k a -> [k] M.keys (Map Text Int -> [Text]) -> Map Text Int -> [Text] forall a b. (a -> b) -> a -> b $ DataFrame -> Map Text Int columnIndices DataFrame df) Maybe Column _ -> Maybe (Vector Double) forall a. Maybe a Nothing {-# INLINE _getColumnAsDouble #-} optionalToDoubleVector :: (Real a) => V.Vector (Maybe a) -> VU.Vector Double optionalToDoubleVector :: forall a. Real a => Vector (Maybe a) -> Vector Double optionalToDoubleVector = [Double] -> Vector Double forall a. 
Unbox a => [a] -> Vector a VU.fromList ([Double] -> Vector Double) -> (Vector (Maybe a) -> [Double]) -> Vector (Maybe a) -> Vector Double forall b c a. (b -> c) -> (a -> b) -> a -> c . ([Double] -> Maybe a -> [Double]) -> [Double] -> Vector (Maybe a) -> [Double] forall a b. (a -> b -> a) -> a -> Vector b -> a V.foldl' (\[Double] acc Maybe a e -> if Maybe a -> Bool forall a. Maybe a -> Bool isJust Maybe a e then a -> Double forall a b. (Real a, Fractional b) => a -> b realToFrac (a -> Maybe a -> a forall a. a -> Maybe a -> a fromMaybe a 0 Maybe a e) Double -> [Double] -> [Double] forall a. a -> [a] -> [a] : [Double] acc else [Double] acc) [] sum :: forall a. (Columnable a, Num a) => Expr a -> DataFrame -> a sum :: forall a. (Columnable a, Num a) => Expr a -> DataFrame -> a sum (Col Text name) DataFrame df = case Text -> DataFrame -> Maybe Column getColumn Text name DataFrame df of Maybe Column Nothing -> DataFrameException -> a forall a e. Exception e => e -> a throw (DataFrameException -> a) -> DataFrameException -> a forall a b. (a -> b) -> a -> b $ Text -> Text -> [Text] -> DataFrameException ColumnNotFoundException Text name Text "sum" (Map Text Int -> [Text] forall k a. Map k a -> [k] M.keys (Map Text Int -> [Text]) -> Map Text Int -> [Text] forall a b. (a -> b) -> a -> b $ DataFrame -> Map Text Int columnIndices DataFrame df) Just ((UnboxedColumn (Vector a column :: VU.Vector a'))) -> case TypeRep a -> TypeRep a -> Maybe (a :~: a) forall a b. TypeRep a -> TypeRep b -> Maybe (a :~: b) forall {k} (f :: k -> *) (a :: k) (b :: k). TestEquality f => f a -> f b -> Maybe (a :~: b) testEquality (forall a. Typeable a => TypeRep a forall {k} (a :: k). Typeable a => TypeRep a typeRep @a') (forall a. Typeable a => TypeRep a forall {k} (a :: k). Typeable a => TypeRep a typeRep @a) of Just a :~: a Refl -> Vector a -> a forall (v :: * -> *) a. 
(Vector v a, Num a) => v a -> a VG.sum Vector a Vector a column Maybe (a :~: a) Nothing -> a 0 Just ((BoxedColumn (Vector a column :: V.Vector a'))) -> case TypeRep a -> TypeRep a -> Maybe (a :~: a) forall a b. TypeRep a -> TypeRep b -> Maybe (a :~: b) forall {k} (f :: k -> *) (a :: k) (b :: k). TestEquality f => f a -> f b -> Maybe (a :~: b) testEquality (forall a. Typeable a => TypeRep a forall {k} (a :: k). Typeable a => TypeRep a typeRep @a') (forall a. Typeable a => TypeRep a forall {k} (a :: k). Typeable a => TypeRep a typeRep @a) of Just a :~: a Refl -> Vector a -> a forall (v :: * -> *) a. (Vector v a, Num a) => v a -> a VG.sum Vector a Vector a column Maybe (a :~: a) Nothing -> a 0 Just ((OptionalColumn (Vector (Maybe a) column :: V.Vector (Maybe a')))) -> case TypeRep a -> TypeRep a -> Maybe (a :~: a) forall a b. TypeRep a -> TypeRep b -> Maybe (a :~: b) forall {k} (f :: k -> *) (a :: k) (b :: k). TestEquality f => f a -> f b -> Maybe (a :~: b) testEquality (forall a. Typeable a => TypeRep a forall {k} (a :: k). Typeable a => TypeRep a typeRep @a') (forall a. Typeable a => TypeRep a forall {k} (a :: k). Typeable a => TypeRep a typeRep @a) of Just a :~: a Refl -> Vector a -> a forall (v :: * -> *) a. (Vector v a, Num a) => v a -> a VG.sum ((Maybe a -> a) -> Vector (Maybe a) -> Vector a forall (v :: * -> *) a b. (Vector v a, Vector v b) => (a -> b) -> v a -> v b VG.map (a -> Maybe a -> a forall a. a -> Maybe a -> a fromMaybe a 0) Vector (Maybe a) Vector (Maybe a) column) Maybe (a :~: a) Nothing -> a 0 sum Expr a expr DataFrame df = case DataFrame -> Expr a -> Either DataFrameException (TypedColumn a) forall a. Columnable a => DataFrame -> Expr a -> Either DataFrameException (TypedColumn a) interpret DataFrame df Expr a expr of Left DataFrameException e -> DataFrameException -> a forall a e. Exception e => e -> a throw DataFrameException e Right (TColumn Column xs) -> case forall a (v :: * -> *). 
(Vector v a, Columnable a) => Column -> Either DataFrameException (v a) toVector @a @V.Vector Column xs of Left DataFrameException e -> DataFrameException -> a forall a e. Exception e => e -> a throw DataFrameException e Right Vector a xs -> Vector a -> a forall (v :: * -> *) a. (Vector v a, Num a) => v a -> a VG.sum Vector a xs instance {-# OVERLAPPING #-} (Columnable b) => ImputeOp (Maybe b) where runImpute :: Expr (Maybe b) -> BaseType (Maybe b) -> DataFrame -> DataFrame runImpute = Expr (Maybe b) -> b -> DataFrame -> DataFrame Expr (Maybe b) -> BaseType (Maybe b) -> DataFrame -> DataFrame forall b. Columnable b => Expr (Maybe b) -> b -> DataFrame -> DataFrame imputeCore runImputeWith :: Columnable (BaseType (Maybe b)) => (Expr (BaseType (Maybe b)) -> Expr (BaseType (Maybe b))) -> Expr (Maybe b) -> DataFrame -> DataFrame runImputeWith Expr (BaseType (Maybe b)) -> Expr (BaseType (Maybe b)) f col :: Expr (Maybe b) col@(Col Text columnName) DataFrame df = case forall a. Columnable a => DataFrame -> Expr a -> Either DataFrameException (TypedColumn a) interpret @b (Text -> DataFrame -> DataFrame filterJust Text columnName DataFrame df) (Expr (BaseType (Maybe b)) -> Expr (BaseType (Maybe b)) f (forall a. Columnable a => Text -> Expr a Col @b Text columnName)) of Left DataFrameException e -> DataFrameException -> DataFrame forall a e. Exception e => e -> a throw DataFrameException e Right (TColumn Column value) -> case forall a. Columnable a => Column -> Either DataFrameException a headColumn @b Column value of Left DataFrameException e -> DataFrameException -> DataFrame forall a e. Exception e => e -> a throw DataFrameException e Right b h -> if (b -> Bool) -> [b] -> Bool forall (t :: * -> *) a. Foldable t => (a -> Bool) -> t a -> Bool all (b -> b -> Bool forall a. Eq a => a -> a -> Bool == b h) (forall a. Columnable a => Column -> [a] toList @b Column value) then Expr (Maybe b) -> b -> DataFrame -> DataFrame forall b. 
Columnable b => Expr (Maybe b) -> b -> DataFrame -> DataFrame imputeCore Expr (Maybe b) col b h DataFrame df else String -> DataFrame forall a. HasCallStack => String -> a error String "Impute expression returned more than one value" runImputeWith Expr (BaseType (Maybe b)) -> Expr (BaseType (Maybe b)) _ Expr (Maybe b) _ DataFrame df = DataFrame df

{- | Impute a column using a replacement derived from an expression over the
column's base type.

This is a thin public alias for 'runImputeWith'; the actual behaviour is
whatever that function does for the chosen @a@.
-}
imputeWith ::
    forall a.
    (ImputeOp a, Columnable (BaseType a)) =>
    (Expr (BaseType a) -> Expr (BaseType a)) ->
    Expr a ->
    DataFrame ->
    DataFrame
imputeWith = runImputeWith

{- | Apply a scalar statistic to the named column.

The frame is first passed through 'filterJust' for that column and the column
is then read with '_getColumnAsDouble'. The result is 'Nothing' when that read
yields 'Nothing', or when the statistic itself evaluates to NaN.
-}
applyStatistic ::
    (VU.Vector Double -> Double) -> T.Text -> DataFrame -> Maybe Double
applyStatistic f name df =
    _getColumnAsDouble name (filterJust name df) >>= rejectNaN . f
  where
    -- A NaN statistic is reported as "no result" rather than as a value.
    rejectNaN :: Double -> Maybe Double
    rejectNaN res
        | isNaN res = Nothing
        | otherwise = Just res
{-# INLINE applyStatistic #-}

{- | Apply a vector-valued statistic (for example a set of quantiles) to the
named column, after 'filterJust' and '_getColumnAsDouble'.

Unlike 'applyStatistic', the output is not inspected for NaN.
-}
applyStatistics ::
    (VU.Vector Double -> VU.Vector Double) ->
    T.Text ->
    DataFrame ->
    Maybe (VU.Vector Double)
applyStatistics f name df = f <$> _getColumnAsDouble name (filterJust name df)

{- | Build a summary frame with a @"Statistic"@ label column followed by one
column per input column for which every statistic could be computed.

Columns for which any statistic is 'Nothing' (e.g. '_getColumnAsDouble'
returns 'Nothing') are left out. All values are rounded to two decimal places
with 'roundTo'.
-}
summarize :: DataFrame -> DataFrame
summarize df = fold columnStats (columnNames df) labelFrame
  where
    -- Row labels; the order must match the list produced by 'stats'.
    labelFrame :: DataFrame
    labelFrame =
        fromNamedColumns
            [
                ( "Statistic"
                , fromList
                    [ "Count" :: T.Text
                    , "Mean"
                    , "Minimum"
                    , "25%"
                    , "Median"
                    , "75%"
                    , "Max"
                    , "StdDev"
                    , "IQR"
                    , "Skewness"
                    ]
                )
            ]

    -- Add the statistics column for @name@ only if every entry is present.
    -- 'sequenceA' yields 'Just' exactly when all entries are 'Just', so the
    -- statistics list is built once and no placeholder default is needed.
    columnStats :: T.Text -> DataFrame -> DataFrame
    columnStats name d = case sequenceA (stats name) of
        Just values ->
            insertUnboxedVector name (VU.fromList (map (roundTo 2) values)) d
        Nothing -> d

    stats :: T.Text -> [Maybe Double]
    stats name =
        [ count
        , mean' <$> doubles
        , quantileAt 0
        , quantileAt 1
        , quantileAt 2
        , quantileAt 3
        , quantileAt 4
        , sqrt . variance' <$> doubles
        , (-) <$> quantileAt 3 <*> quantileAt 1
        , skewness' <$> doubles
        ]
      where
        -- The column as doubles after 'filterJust'; shared by every
        -- statistic below so the conversion is written only once.
        doubles :: Maybe (VU.Vector Double)
        doubles = _getColumnAsDouble name (filterJust name df)

        -- Taken from the unfiltered frame, unlike the other statistics.
        -- NOTE(review): this presumably counts rows that 'filterJust'
        -- would drop — confirm that is the intended meaning of "Count".
        count :: Maybe Double
        count = fromIntegral . numElements <$> getColumn name df

        -- Quantiles 0..4 of 4, i.e. min, Q1, median, Q3, max.
        quantiles :: Maybe (VU.Vector Double)
        quantiles = quantiles' (VU.fromList [0, 1, 2, 3, 4]) 4 <$> doubles

        -- Safe indexing: a result vector that is too short gives 'Nothing'
        -- (so the column is skipped) instead of an index-out-of-bounds error.
        quantileAt :: Int -> Maybe Double
        quantileAt i = quantiles >>= (VG.!? i)

{- | Round a value to @n@ decimal places.

NaN and infinities are returned unchanged: 'round' has no meaningful
'Integer' result for them, and passing them through it would turn a NaN
statistic into an infinity. A negative @n@ rounds to tens, hundreds, and so
on.
-}
roundTo :: Int -> Double -> Double
roundTo n x
    | isNaN x || isInfinite x = x
    | otherwise = fromInteger (round (x * scale)) / scale
  where
    -- '^^' accepts negative exponents; for @n >= 0@ it equals @10 ^ n@.
    scale :: Double
    scale = 10 ^^ n

{- | Render a fraction (e.g. @0.25@) as a percentage with two decimal places
(@"25.00%"@).

Any input below @0.00005@ — including zero and negative values — is rendered
as @"<0.01%"@.
-}
toPct2dp :: Double -> String
toPct2dp x
    | x < 0.00005 = "<0.01%"
    | otherwise = printf "%.2f%%" (x * 100)