{-# LANGUAGE ExplicitNamespaces #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE GADTs #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RankNTypes #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TypeApplications #-}
{-# LANGUAGE UndecidableInstances #-}

module DataFrame.Operations.Statistics where

import qualified Data.List as L
import qualified Data.Map as M
import qualified Data.Text as T
import qualified Data.Vector as V
import qualified Data.Vector.Generic as VG
import qualified Data.Vector.Unboxed as VU

import Prelude as P

import Control.Exception (throw)
import Data.Function ((&))
import Data.Maybe (fromMaybe, isJust)
import Data.Type.Equality (TestEquality (testEquality), type (:~:) (Refl))
import DataFrame.Errors (DataFrameException (..))
import DataFrame.Internal.Column
import DataFrame.Internal.DataFrame (
    DataFrame (..),
    empty,
    getColumn,
 )
import DataFrame.Internal.Expression
import DataFrame.Internal.Interpreter
import DataFrame.Internal.Nullable (BaseType)
import DataFrame.Internal.Row (showValue, toAny)
import DataFrame.Internal.Statistics
import DataFrame.Internal.Types
import DataFrame.Operations.Core
import DataFrame.Operations.Subset (filterJust)
import DataFrame.Operations.Transformations (ImputeOp (..), imputeCore)
import Text.Printf (printf)
import Type.Reflection (typeRep)


{- | Build a frequency table for the values produced by an expression.

The result has a @Statistic@ column holding the row labels @Count@ and
@Percentage (%)@, followed by one column per entry returned by
'valueCounts'. Each of those columns is named with 'showValue' and holds
the entry's count and its share of the total count, formatted by
'toPct2dp'.

Throws the 'DataFrameException' reported by 'columnAsVector' when the
expression cannot be turned into a vector for this frame.
-}
frequencies :: forall a. (Columnable a) => Expr a -> DataFrame -> DataFrame
frequencies expr df =
    case columnAsVector expr df of
        Left err -> throw err
        -- The vector itself is not needed: it is requested only so that a
        -- failing expression is reported before the table is built.
        Right _ -> L.foldl' addValueColumn labelColumn counts
  where
    counts :: [(a, Int)]
    counts = valueCounts expr df

    -- Denominator shared by every percentage; computed once rather than
    -- once per distinct value.
    total :: Double
    total = fromIntegral (P.sum (map snd counts))

    -- Seed frame containing only the row labels.
    labelColumn :: DataFrame
    labelColumn =
        empty
            & insertVector "Statistic" (V.fromList ["Count" :: T.Text, "Percentage (%)"])

    -- Append the two-row column describing a single (value, count) entry.
    addValueColumn :: DataFrame -> (a, Int) -> DataFrame
    addValueColumn acc (value, k) =
        insertVector
            (showValue @a value)
            (V.fromList [toAny k, toAny (toPct2dp (fromIntegral k / total))])
            acc


{- | Mean of a numeric expression, as a 'Double'.

A bare column reference is read through '_getColumnAsDouble' and reduced
with 'meanDouble''; any other expression is evaluated with 'interpret',
converted with 'toUnboxedVector' and reduced with 'mean''.

Throws the 'DataFrameException' produced by the lookup, interpretation or
conversion step, and calls 'error' when '_getColumnAsDouble' cannot view
the named column as doubles.
-}
mean ::
    forall a. (Columnable a, Real a, VU.Unbox a) => Expr a -> DataFrame -> Double
mean (Col name) df =
    case _getColumnAsDouble name df of
        Just xs -> meanDouble' xs
        Nothing -> error "[INTERNAL ERROR] Column is non-numeric"
mean expr df =
    case interpret df expr of
        Left e -> throw e
        Right (TColumn col) ->
            case toUnboxedVector @a col of
                Left e -> throw e
                Right xs -> mean' xs

{- | Mean of an optional numeric expression, as a 'Double'.

The values are passed through 'optionalToDoubleVector' before being
reduced with 'mean''. A bare column reference is read with
'columnAsVector'; any other expression is evaluated with 'interpret' and
converted with 'toVector'.

Throws the 'DataFrameException' produced by whichever of those steps fails.
-}
meanMaybe ::
    forall a. (Columnable a, Real a) => Expr (Maybe a) -> DataFrame -> Double
meanMaybe (Col name) df =
    mean' (optionalToDoubleVector values)
  where
    values = either throw id (columnAsVector (Col @(Maybe a) name) df)
meanMaybe expr df =
    case interpret @(Maybe a) df expr of
        Left e -> throw e
        Right (TColumn col) ->
            case toVector @(Maybe a) col of
                Left e -> throw e
                Right xs -> mean' (optionalToDoubleVector xs)


{- | Median of a numeric expression, as a 'Double'.

A bare column reference is read with 'columnAsUnboxedVector'; any other
expression is evaluated with 'interpret' and converted with
'toUnboxedVector'. Either way the values are reduced with 'median''.

Throws the 'DataFrameException' produced by whichever of those steps fails.
-}
median ::
    forall a. (Columnable a, Real a, VU.Unbox a) => Expr a -> DataFrame -> Double
median (Col name) df =
    either throw median' (columnAsUnboxedVector (Col @a name) df)
median expr df =
    case interpret df expr of
        Left e -> throw e
        Right (TColumn col) ->
            case toUnboxedVector @a col of
                Left e -> throw e
                Right xs -> median' xs


{- | Median of an optional numeric expression, as a 'Double'.

The values are passed through 'optionalToDoubleVector' before being
reduced with 'median''. A bare column reference is read with
'columnAsVector'; any other expression is evaluated with 'interpret' and
converted with 'toVector'.

Throws the 'DataFrameException' produced by whichever of those steps fails.
-}
medianMaybe ::
    forall a. (Columnable a, Real a) => Expr (Maybe a) -> DataFrame -> Double
medianMaybe (Col name) df =
    median' (optionalToDoubleVector values)
  where
    values = either throw id (columnAsVector (Col @(Maybe a) name) df)
medianMaybe expr df =
    case interpret @(Maybe a) df expr of
        Left e -> throw e
        Right (TColumn col) ->
            case toVector @(Maybe a) col of
                Left e -> throw e
                Right xs -> median' (optionalToDoubleVector xs)


{- | Percentile of a numeric expression, as a 'Double'.

The 'Int' argument is forwarded unchanged to 'percentile''
(NOTE(review): presumably the percentile rank; confirm the accepted range
against 'percentile''). A bare column reference is read with
'columnAsUnboxedVector'; any other expression is evaluated with
'interpret' and converted with 'toUnboxedVector'.

Throws the 'DataFrameException' produced by whichever of those steps fails.
-}
percentile ::
    forall a.
    (Columnable a, Real a, VU.Unbox a) => Int -> Expr a -> DataFrame -> Double
percentile n (Col name) df =
    either throw (percentile' n) (columnAsUnboxedVector (Col @a name) df)
percentile n expr df =
    case interpret df expr of
        Left e -> throw e
        Right (TColumn col) ->
            case toUnboxedVector @a col of
                Left e -> throw e
                Right xs -> percentile' n xs


{- | Percentile of an expression over any ordered element type.

Unlike 'percentile', the result has the element type itself: the values
are handed to 'percentileOrd'' together with the 'Int' argument. A bare
column reference is read with 'columnAsVector'; any other expression is
evaluated with 'interpret' and converted with 'toVector'.

Throws the 'DataFrameException' produced by whichever of those steps fails.
-}
genericPercentile ::
    forall a.
    (Columnable a, Ord a) => Int -> Expr a -> DataFrame -> a
genericPercentile n (Col name) df =
    either throw (percentileOrd' n) (columnAsVector (Col @a name) df)
genericPercentile n expr df =
    case interpret df expr of
        Left e -> throw e
        Right (TColumn col) ->
            case toVector @a col of
                Left e -> throw e
                Right xs -> percentileOrd' n xs


{- | Standard deviation of a numeric expression: the square root of
'variance''.

A bare column reference is read with 'columnAsUnboxedVector'; any other
expression is evaluated with 'interpret' and converted with
'toUnboxedVector'.

Throws the 'DataFrameException' produced by whichever of those steps fails.
-}
standardDeviation ::
    forall a. (Columnable a, Real a, VU.Unbox a) => Expr a -> DataFrame -> Double
standardDeviation (Col name) df =
    either throw (sqrt . variance') (columnAsUnboxedVector (Col @a name) df)
standardDeviation expr df =
    case interpret df expr of
        Left e -> throw e
        Right (TColumn col) ->
            case toUnboxedVector @a col of
                Left e -> throw e
                Right xs -> sqrt (variance' xs)


{- | Skewness of a numeric expression, as computed by 'skewness''.

A bare column reference is read with 'columnAsUnboxedVector'; any other
expression is evaluated with 'interpret' and converted with
'toUnboxedVector'.

Throws the 'DataFrameException' produced by whichever of those steps fails.
-}
skewness ::
    forall a. (Columnable a, Real a, VU.Unbox a) => Expr a -> DataFrame -> Double
skewness (Col name) df =
    either throw skewness' (columnAsUnboxedVector (Col @a name) df)
skewness expr df =
    case interpret df expr of
        Left e -> throw e
        Right (TColumn col) ->
            case toUnboxedVector @a col of
                Left e -> throw e
                Right xs -> skewness' xs


{- | Variance of a numeric expression, as a 'Double'.

A bare column reference is read through '_getColumnAsDouble' and reduced
with 'varianceDouble''; any other expression is evaluated with
'interpret', converted with 'toUnboxedVector' and reduced with
'variance''.

Throws the 'DataFrameException' produced by the lookup, interpretation or
conversion step, and calls 'error' when '_getColumnAsDouble' cannot view
the named column as doubles.
-}
variance ::
    forall a. (Columnable a, Real a, VU.Unbox a) => Expr a -> DataFrame -> Double
variance (Col name) df =
    case _getColumnAsDouble name df of
        Just xs -> varianceDouble' xs
        Nothing -> error "[INTERNAL ERROR] Column is non-numeric"
variance expr df =
    case interpret df expr of
        Left e -> throw e
        Right (TColumn col) ->
            case toUnboxedVector @a col of
                Left e -> throw e
                Right xs -> variance' xs


{- | Inter-quartile range of a numeric expression, as computed by
'interQuartileRange''.

A bare column reference is read with 'columnAsUnboxedVector'; any other
expression is evaluated with 'interpret' and converted with
'toUnboxedVector'.

Throws the 'DataFrameException' produced by whichever of those steps fails.
-}
interQuartileRange ::
    forall a. (Columnable a, Real a, VU.Unbox a) => Expr a -> DataFrame -> Double
interQuartileRange (Col name) df =
    either throw interQuartileRange' (columnAsUnboxedVector (Col @a name) df)
interQuartileRange expr df =
    case interpret df expr of
        Left e -> throw e
        Right (TColumn col) ->
            case toUnboxedVector @a col of
                Left e -> throw e
                Right xs -> interQuartileRange' xs


{- | Correlation between two named columns, as computed by 'correlation''.

Both columns are read through '_getColumnAsDouble'. The result is
'Nothing' when either column cannot be viewed as doubles or when
'correlation'' itself yields 'Nothing'. A missing column surfaces as the
exception thrown by '_getColumnAsDouble'.
-}
correlation :: T.Text -> T.Text -> DataFrame -> Maybe Double
correlation first second df = do
    xs <- _getColumnAsDouble first df
    ys <- _getColumnAsDouble second df
    correlation' xs ys

{- | View a named column as an unboxed vector of 'Double'.

Only unboxed columns are converted:

* elements that already are 'Double' are returned unchanged;
* integral elements are converted with 'fromIntegral';
* floating elements are converted with 'realToFrac';
* any other unboxed element type yields 'Nothing'.

Every other kind of column also yields 'Nothing'. Throws
'ColumnNotFoundException' when the frame has no column with that name.
-}
_getColumnAsDouble :: T.Text -> DataFrame -> Maybe (VU.Vector Double)
_getColumnAsDouble name df = case getColumn name df of
    Just (UnboxedColumn (f :: VU.Vector a)) ->
        case testEquality (typeRep @a) (typeRep @Double) of
            -- Already doubles: hand the vector back without copying.
            Just Refl -> Just f
            Nothing -> case sIntegral @a of
                STrue -> Just (VU.map fromIntegral f)
                SFalse -> case sFloating @a of
                    STrue -> Just (VU.map realToFrac f)
                    SFalse -> Nothing
    Nothing ->
        throw $
            ColumnNotFoundException name "_getColumnAsDouble" (M.keys $ columnIndices df)
    -- The column exists but is not unboxed.
    _ -> Nothing
{-# INLINE _getColumnAsDouble #-}

{- | Drop the 'Nothing' entries of an optional vector and convert the
remaining values to 'Double' with 'realToFrac'.

The surviving values keep their original relative order.
-}
optionalToDoubleVector :: (Real a) => V.Vector (Maybe a) -> VU.Vector Double
optionalToDoubleVector = VU.fromList . V.foldr keepPresent []
  where
    -- A right fold conses in column order, so nothing needs reversing.
    keepPresent entry rest = maybe rest (\x -> realToFrac x : rest) entry


{- | Sum of a numeric expression, in the element type itself.

For a bare column reference the stored column is inspected directly:
unboxed and boxed columns are summed as they are, and optional columns
count each 'Nothing' as @0@. Any other expression is evaluated with
'interpret' and converted with 'toVector' before being summed.

Throws 'ColumnNotFoundException' when the named column does not exist,
and the 'DataFrameException' produced by interpretation or conversion
for non-column expressions.

NOTE(review): when a named column exists but its element type differs
from the requested one, the result is silently @0@ rather than an error.
Confirm that this is intended.
-}
sum ::
    forall a. (Columnable a, Num a) => Expr a -> DataFrame -> a
sum (Col name) df = case getColumn name df of
    Nothing -> throw $ ColumnNotFoundException name "sum" (M.keys $ columnIndices df)
    Just (UnboxedColumn (column :: VU.Vector a')) ->
        case testEquality (typeRep @a') (typeRep @a) of
            Just Refl -> VG.sum column
            Nothing -> 0
    Just (BoxedColumn (column :: V.Vector a')) ->
        case testEquality (typeRep @a') (typeRep @a) of
            Just Refl -> VG.sum column
            Nothing -> 0
    Just (OptionalColumn (column :: V.Vector (Maybe a'))) ->
        case testEquality (typeRep @a') (typeRep @a) of
            -- Missing entries contribute nothing to the total.
            Just Refl -> VG.sum (VG.map (fromMaybe 0) column)
            Nothing -> 0
sum expr df = case interpret df expr of
    Left e -> throw e
    Right (TColumn col) ->
        case toVector @a @V.Vector col of
            Left e -> throw e
            Right xs -> VG.sum xs


{- | Imputation for optional columns.

'runImpute' is 'imputeCore' unchanged.

'runImputeWith' derives the fill value from the column itself. For a bare
column reference it removes the missing rows with 'filterJust', applies
the supplied expression transformer to the column and evaluates the
result on the filtered frame. The evaluated column must hold a single
repeated value, which is then passed to 'imputeCore'. For any other
expression the frame is returned unchanged.
-}
instance {-# OVERLAPPING #-} (Columnable b) => ImputeOp (Maybe b) where
    runImpute = imputeCore

    runImputeWith f col@(Col columnName) df =
        case interpret @b (filterJust columnName df) (f (Col @b columnName)) of
            Left e -> throw e
            Right (TColumn value) ->
                case headColumn @b value of
                    Left e -> throw e
                    Right h ->
                        -- Accept the result only if every row agrees with
                        -- the first one, i.e. it is a single scalar.
                        if all (== h) (toList @b value)
                            then imputeCore col h df
                            else error "Impute expression returned more than one value"
    -- Only bare column references can be imputed this way.
    runImputeWith _ _ df = df

{- | Fill the missing values of an expression with a value computed from
its own data.

This is the 'runImputeWith' method of 'ImputeOp' under a public name; see
the instance for the element type in question for the exact behaviour.
-}
imputeWith ::
    forall a.
    (ImputeOp a, Columnable (BaseType a)) =>
    (Expr (BaseType a) -> Expr (BaseType a)) ->
    Expr a ->
    DataFrame ->
    DataFrame
imputeWith = runImputeWith

{- | Apply a scalar statistic to a named column.

The frame is first passed through 'filterJust' for that column, and the
column is then read with '_getColumnAsDouble'. The result is 'Nothing'
when the column cannot be viewed as doubles or when the statistic
evaluates to NaN.
-}
applyStatistic ::
    (VU.Vector Double -> Double) -> T.Text -> DataFrame -> Maybe Double
applyStatistic f name df =
    _getColumnAsDouble name (filterJust name df) >>= rejectNaN . f
  where
    -- A NaN result is reported as "no value".
    rejectNaN res
        | isNaN res = Nothing
        | otherwise = Just res
{-# INLINE applyStatistic #-}

{- | Apply a vector-valued statistic to a named column.

The frame is first passed through 'filterJust' for that column, and the
column is then read with '_getColumnAsDouble'. The result is 'Nothing'
when the column cannot be viewed as doubles.
-}
applyStatistics ::
    (VU.Vector Double -> VU.Vector Double) ->
    T.Text ->
    DataFrame ->
    Maybe (VU.Vector Double)
applyStatistics f name df = f <$> _getColumnAsDouble name (filterJust name df)


-- | Build a table of descriptive statistics for a data frame.
--
-- The result has a @"Statistic"@ column holding the row labels, followed by
-- one column per input column for which every statistic could be computed.
-- An input column for which any statistic is 'Nothing' (for example when
-- '_getColumnAsDouble' cannot read it) is left out of the result. All values
-- are rounded to two decimal places with 'roundTo'.
summarize :: DataFrame -> DataFrame
summarize df = fold columnStats (columnNames df) statisticLabels
  where
    -- Seed frame carrying the row labels. The order here must match the
    -- order of the list returned by 'stats'.
    statisticLabels :: DataFrame
    statisticLabels =
        fromNamedColumns
            [
                ( "Statistic"
                , fromList
                    [ "Count" :: T.Text
                    , "Mean"
                    , "Minimum"
                    , "25%"
                    , "Median"
                    , "75%"
                    , "Max"
                    , "StdDev"
                    , "IQR"
                    , "Skewness"
                    ]
                )
            ]

    -- Append the statistics of one column to the accumulated frame, or leave
    -- the frame untouched when any statistic is unavailable. 'sequence'
    -- succeeds exactly when every entry is 'Just', so the statistics are
    -- computed once and no default value is needed.
    columnStats :: T.Text -> DataFrame -> DataFrame
    columnStats name acc = case sequence (stats name) of
        Just values ->
            insertUnboxedVector name (VU.fromList (map (roundTo 2) values)) acc
        Nothing -> acc

    -- All statistics for one column, in the same order as the row labels.
    stats :: T.Text -> [Maybe Double]
    stats name =
        let
            -- NOTE(review): the count is taken from the unfiltered column,
            -- whereas every other statistic is computed after 'filterJust';
            -- confirm whether missing entries are meant to be counted.
            count :: Maybe Double
            count = fromIntegral . numElements <$> getColumn name df

            -- Read the filtered column once and share it between statistics.
            values :: Maybe (VU.Vector Double)
            values = _getColumnAsDouble name (filterJust name df)

            -- Quantiles 0..4 out of 4, i.e. minimum, quartiles and maximum.
            quantiles :: Maybe (VU.Vector Double)
            quantiles = quantiles' (VU.fromList [0, 1, 2, 3, 4]) 4 <$> values

            -- Total lookup: a missing index yields 'Nothing' instead of an
            -- out-of-bounds error.
            quantileAt :: Int -> Maybe Double
            quantileAt i = quantiles >>= (VG.!? i)

            min' = quantileAt 0
            quartile1 = quantileAt 1
            median' = quantileAt 2
            quartile3 = quantileAt 3
            max' = quantileAt 4
            iqr = (-) <$> quartile3 <*> quartile1
         in
            [ count
            , mean' <$> values
            , min'
            , quartile1
            , median'
            , quartile3
            , max'
            , sqrt . variance' <$> values
            , iqr
            , skewness' <$> values
            ]


-- | Round a number to @n@ decimal places.
--
-- Rounding is done with 'round' after scaling by a power of ten, so exact
-- halves after scaling go to the nearest even integer.
--
-- * NaN and infinite inputs are returned unchanged. Passing them through
--   'round' would yield a meaningless value (NaN would stop being NaN).
-- * A negative @n@ rounds to the left of the decimal point, e.g.
--   @roundTo (-1) 123 == 120@, instead of failing on a negative exponent.
roundTo :: Int -> Double -> Double
roundTo n x
    | isNaN x || isInfinite x = x
    | n >= 0 = fromInteger (round (x * scale)) / scale
    | otherwise = fromInteger (round (x / scale)) * scale
  where
    -- Always a non-negative power, so '^' cannot see a negative exponent.
    scale :: Double
    scale = 10 ^ abs n

-- | Render a fraction as a percentage with two decimal places, e.g.
-- @toPct2dp 0.5 == "50.00%"@.
--
-- Any value below @0.00005@ (that is, below 0.005%) is rendered as
-- @"<0.01%"@; this includes zero and negative inputs.
toPct2dp :: Double -> String
toPct2dp x
    | x < 0.00005 = "<0.01%"
    | otherwise = printf "%.2f%%" (x * 100)