Python, MATLAB, Julia, R code: Chapter 6

Intro to Probability for Data Science

Chapters

Lecture Videos + Slides

Selected Exercise Videos

Code and Data

Problem Solutions (Instructor Only)

Purdue Courses

Acknowledgement: The Julia code is written by the contributors listed here.

Acknowledgement: The R code is written by contributors listed here.

Chapter 6.2 Probability Inequality

Compare Chebyshev's and Chernoff's bounds

% MATLAB code to compare the probability bounds
epsilon = 0.1;
sigma   = 1;
N       = logspace(1,3.9,50);
p_exact = 1-normcdf(sqrt(N)*epsilon/sigma);
p_cheby = sigma^2./(epsilon^2*N);
p_chern = exp(-epsilon^2*N/(2*sigma^2));

loglog(N, p_exact, '-o', 'Color', [1 0.5 0], 'LineWidth', 2); hold on;
loglog(N, p_cheby, '-', 'Color', [0.2 0.7 0.1], 'LineWidth', 2);
loglog(N, p_chern, '-', 'Color', [0.2 0.0 0.8], 'LineWidth', 2);

using Distributions, Plots

epsilon = 0.1
sigma   = 1

p_exact(N) = 1 - cdf(Normal(), sqrt(N) * epsilon / sigma)
p_cheby(N) = sigma^2 / (epsilon^2 * N);
p_chern(N) = exp(-epsilon^2 * N / (2*sigma^2));

ex_args    = (linewidth=2, color=RGB(1.0, 0.5, 0.0), label="Exact", markershape=:circle)
chb_args   = (linewidth=2, color=RGB(0.2, 0.7, 0.1), label="Chebyshev", linestyle=:dashdot)
chern_args = (linewidth=2, color=RGB(0.2, 0.0, 0.8), label="Chernoff")

Ns = [10^i for i in range(1, 3.8, length=50)]
scatter(Ns, p_exact.(Ns);  ex_args...,
        xaxis=:log10,
        yaxis=:log10, yticks=[1e-15, 1e-10, 1e-5, 1e0],
        legend=:bottomleft)
plot!(p_cheby; chb_args...)
plot!(p_chern; chern_args...)


library(pracma)

epsilon <- 0.1
sigma <- 1;
N <- logspace(1,3.9,50)
p_exact <- 1-pnorm(sqrt(N)*epsilon/sigma)
p_cheby <- sigma^2 / (epsilon^2*N)
p_chern <- exp(-epsilon^2*N/(2*sigma*2))

plot(log(N), log(p_exact), pch=1, col="orange", lwd=4, xlab="log(N)", ylab="log(Probability)")
lines(log(N), log(p_cheby), lty=6, col="green", lwd=4)
lines(log(N), log(p_chern), pch=19, col="blue", lwd=4)
legend(3, -25, c("Exact","Chebyshev","Chernoff"), fill=c("orange", "green", "blue"))

Chapter 6.3 Law of Large Numbers

Weak law of large numbers

% MATLAB code to illustrate the weak law of large numbers
Nset = round(logspace(2,5,100));
for i=1:length(Nset)
    N = Nset(i);
    p = 0.5;
    x(:,i) = binornd(N, p, 1000,1)/N;
end
y = x(1:10:end,:)';
semilogx(Nset, y, 'kx'); hold on;
semilogx(Nset, p+3*sqrt(p*(1-p)./Nset), 'r', 'LineWidth', 4);
semilogx(Nset, p-3*sqrt(p*(1-p)./Nset), 'r', 'LineWidth', 4);

import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
import numpy.matlib
p = 0.5
Nset = np.round(np.logspace(2,5,100)).astype(int)
x = np.zeros((1000,Nset.size))
for i in range(Nset.size):
  N = Nset[i]
  x[:,i] = stats.binom.rvs(N, p, size=1000)/N
Nset_grid = np.matlib.repmat(Nset, 1000, 1)
plt.semilogx(Nset_grid, x,'ko');
plt.semilogx(Nset, p + 3*np.sqrt((p*(1-p))/Nset), 'r', linewidth=6)
plt.semilogx(Nset, p - 3*np.sqrt((p*(1-p))/Nset), 'r', linewidth=6)

using Distributions, Plots

p = 0.5
Nset = round.((10).^range(2,5,length=100))
x = zeros(length(Nset), 1000)
for (i,N) in enumerate(Nset)
    x[i,:] = rand(Binomial(N, p), 1000) / N
end
y = x[:, 1:10:end]
scatter(Nset, y; xaxis=:log10, xticks=[1e2, 1e3, 1e4, 1e5],
        markershape=:x, color=:black,
        ylabel="sample average",
        legend=false)
plot!(Nset, p .+ 3*sqrt.(p*(1-p) ./ Nset); color=:red, linewidth=4)
plot!(Nset, p .- 3*sqrt.(p*(1-p) ./ Nset); color=:red, linewidth=4)


library(pracma)

p <- 0.5
Nset <- as.integer(round(logspace(2,5,100)))
x <- matrix(rep(0, 1000*length(Nset)), nrow=1000)
for (i in 1:length(Nset)) {
  N = Nset[i]
  x[,i] <- rbinom(1000, N, p) / N
}
Nset_grid <- repmat(Nset, m=1, n=1000)

semilogx(Nset_grid, x, col='black', pch=19)
points(Nset, p + 3*(((p*(1-p))/Nset)^(1/2)), col='red', pch=19, lwd=1)
points(Nset, p - 3*(((p*(1-p))/Nset)^(1/2)), col='red', pch=19, lwd=1)

Chapter 6.4 Central Limit Theorem

PDF of the sum of two Gaussians

% MATLAB: Plot the PDF of the sum of two Gaussians
figure;
n = 10000;
K = 2;
Z = zeros(1,n);
for i=1:K
    X = randi(6,1,n);
    Z = Z + X;
end
m = 3.5*K;
v = sqrt(K*(6^2-1)/12);
histogram(Z,K-0.5:6*K+0.5,'Normalization',...
    'probability','FaceColor',[0 0.5 0.8],'LineWidth',2);
set(gcf, 'Position', [100, 100, 600, 400]);
set(gca,'FontWeight','bold','fontsize',14);

using Distributions, Plots

n = 10000;
K = 2 
X = DiscreteUniform(1, 6)
mu = mean(X)   
sigma = std(X)    
Z = sum(rand(X, n) for _ in 1:K)
mk = K * mu
sk = sqrt(K) * sigma
bin_range = (floor(mk - 3sk) - 1/2):(ceil(mk + 3sk) + 1/2)
plot_args = (normalize=true, color=RGB(0, 0.5, 0.8), linewidth=2, legend=false,
             bins=bin_range)

histogram(Z; plot_args..., title="⚁"^K)


library(pracma)

n <- 10000
K <- 2
Z <- rep(0, n)

for (i in 1:K) {
  X <- runif(n, min=1, max=6)
  Z <- Z + X
}
hist(Z,breaks=(K-0.5):(6*K+0.5),freq=FALSE)

Visualize convergence in distribution

% MATLAB: Visualize convergence in distribution
N = 10; % N = 50;
x = linspace(0,N,1001);
p = 0.5;
p_b = binopdf(x, N, p);
p_n = normpdf(x, N*p, sqrt(N*p*(1-p)));

c_b = binocdf(x, N, p);
c_n = normcdf(x, N*p, sqrt(N*p*(1-p)));

figure;
plot(x,p_b,'LineWidth',2,'Color',[0,0,0]); hold on;
plot(x,p_n,'LineWidth',6,'Color',[0.8,0,0]); hold off;
legend('Binomial', 'Gaussian', 'Location', 'Best');
grid on;
set(gcf, 'Position', [100, 100, 600, 400]);
set(gca,'FontWeight','bold','fontsize',14);

using Distributions, Plots
N, p = 10, 0.5

B = Binomial(N,p)
Z = Normal(mean(B), std(B)) 

p_b(x) = pdf(B, x)
p_n(x) = pdf(Z, x)

c_b(x) = cdf(B, x)
c_n(x) = cdf(Z, x)

bin_args = (color = RGB(0, 0, 0), linewidth=2, label="Binomial")
norm_args = (color = RGB(0.8, 0, 0), linewidth=6, label="Normal")

ns = 0:N
p1 = plot(ns, p_b.(ns); bin_args..., seriestype=:sticks)
plot!(p_n, 0, N; norm_args...)

p2 = plot(c_b, 0, N; bin_args...)
plot!(c_n; norm_args...)

l = @layout [a b]
plot(p1, p2, layout=l)


library(pracma)

N <- 10
N <- 50
x <- linspace(0, N, 1001)
p <- 0.5
p_b <- dbinom(x, N, p)
p_n <- dnorm(x, N*p, (N*p*(1-p))**(1/2))

c_b <- pbinom(x, N, p)
c_n <- pnorm(x, N*p, (N*p*(1-p))**(1/2))

plot(x, p_n, lwd=1, col='red')
lines(x, p_b, lwd=2, col='black')
legend("topright", c('Binomial', 'Gaussian'), fill=c('black', 'red'))

Poisson to Gaussian: convergence in distribution

% MATLAB: Poisson to Gaussian: convergence in distribution
N = 4; % N = 10; % N = 50;
x = linspace(0,2*N,1001);
lambda = 1;
p_b = poisspdf(x, N*lambda);
p_n = normpdf(x, N*lambda, sqrt(N*lambda));

c_b = poisscdf(x, N*lambda);
c_n = normcdf(x, N*lambda, sqrt(N*lambda));

figure;
plot(x,p_b,'LineWidth',2,'Color',[0,0,0]); hold on;
plot(x,p_n,'LineWidth',6,'Color',[0.8,0,0]); hold off;
legend('Poisson', 'Gaussian', 'Location', 'NE');
grid on;
set(gcf, 'Position', [100, 100, 600, 400]);
set(gca,'FontWeight','bold','fontsize',18);

figure;
plot(x,c_b,'LineWidth',2,'Color',[0,0,0]); hold on;
plot(x,c_n,'LineWidth',6,'Color',[0.8,0,0]); hold off;
legend('Poisson', 'Gaussian', 'Location', 'SE');
grid on;
set(gcf, 'Position', [100, 100, 600, 400]);
set(gca,'FontWeight','bold','fontsize',18);

using Distributions, Plots

N = 4; 
lambda = 1

P = Poisson(N*lambda)
Z = Normal(mean(P), std(P))  

p_b(x) = pdf(P, x) 
p_n(x) = pdf(Z, x)

c_b(x) = cdf(P, x)
c_n(x) = cdf(Z, x)

bin_args = (linewidth=2, color=RGB(0.0, 0.0, 0.0), label="Poisson")
norm_args = (linewidth=6, colr=RGB(0.8, 0.0, 0.0), label="Gaussian")

ns = 0:2N
p1 = plot(ns, p_b.(ns); bin_args..., seriestype=:sticks, legend=:topright)
plot!(p_n, 0, 2N; norm_args...)

p2 = plot(c_b, 0, 2N; bin_args..., legend=:bottomright)
plot!(c_n; norm_args...)

l = @layout [a b]
plot(p1, p2, layout=l)


library(pracma)

N <- 4



x <- linspace(0,2*N,1001)
lambda <- 1
p_b <- dpois(x, N*lambda)
p_n <- dnorm(x, N*lambda, sqrt(N*lambda))

c_b <- ppois(x, N*lambda);
c_n = pnorm(x, N*lambda, sqrt(N*lambda));

plot(x, p_n, col="red")
lines(x, p_b, col="black")
legend("topright", c('Poisson', 'Gaussian'), fill=c('black', 'red'))

plot(x, c_n, col="red")
lines(x, c_b, col="black")
legend("topright", c('Poisson', 'Gaussian'), fill=c('black', 'red'))

Visualize the Central Limit Theorem

% MATLAB: Visualize the Central Limit Theorem
N = 10;
x = linspace(0,N,1001);
p = 0.5;
p_b = binopdf(x, N, p);
p_n = normpdf(x, N*p, sqrt(N*p*(1-p)));

c_b = binocdf(x, N, p);
c_n = normcdf(x, N*p, sqrt(N*p*(1-p)));

x2 = linspace(5-2.5,5+2.5,1001);
q2 = normpdf(x2,N*p, sqrt(N*p*(1-p)));

figure;
area(x2,q2,'EdgeColor','none','FaceColor',[0.6,0.9,1]); hold on;
plot(x,p_b,'LineWidth',2,'Color',[0,0,0]);
grid on;
set(gcf, 'Position', [100, 100, 600, 400]);
set(gca,'FontWeight','bold','fontsize',14);

figure;
area(x2,q2,'EdgeColor','none','FaceColor',[0.6,0.9,1]); hold on;
plot(x,p_n,'LineWidth',6,'Color',[0.8,0,0]);
grid on;
set(gcf, 'Position', [100, 100, 600, 400]);
set(gca,'FontWeight','bold','fontsize',14);

using Distributions, Plots

N = 10
x = range(0, N, length=1001)
p = 0.5


B = Binomial(N,p)
Z = Normal(mean(B), std(B))  

p_b(x) = pdf(B, x)
p_n(x) = pdf(Z, x)

c_b(x) = cdf(B, x)
c_n(x) = cdf(Z, x)


xs = range(5-2.5, 5 + 2.5, length=1001)

p1 = plot(p_n, 5-2.5, 5+2.5; fillrange=0 .* xs, color=RGB(0.6, 0.9, 1.0), legend=false,
     title="Binomial PDF")
plot!(0:N, p_b.(0:N); color=RGB(0,0,0), linewidth=2, seriestype=:sticks, layout=1)


p2 = plot(p_n, 5-2.5, 5+2.5, fillrange=0 .* xs, color=RGB(0.6, 0.9, 1.0), legend=false,
     title = "Gaussian PDF")
plot!(p_n, 0, N; color=RGB(0.8, 0, 0), linewidth=6, layout=2)

l = @layout [a b]
plot(p1, p2, layout=l)


library(pracma)

N <- 10
x <- linspace(0,N,1001)
p <- 0.5
p_b <- dbinom(x, N, p);
p_n <- dnorm(x, N*p, sqrt(N*p*(1-p)));

c_b <- pbinom(x, N, p);
c_n <- pnorm(x, N*p, sqrt(N*p*(1-p)));

x2 <- linspace(5-2.5,5+2.5,1001);
q2 <- dnorm(x2,N*p, sqrt(N*p*(1-p)));

plot(x, p_n, col="red")
points(x, p_b, col="black", pch=19)

polygon(c(min(x2), x2, max(x2)), c(0, q2, 0), col='lightblue')

How moment generating of Gaussian approximates in CLT

% MATLAB: Central Limit Theorem from moment generating functions
p = 0.5;
s = linspace(-10,10,1001);
MX = 1-p+p*exp(s);
N = 2;
semilogy(s, (1-p+p*exp(s/N)).^N, 'LineWidth',8, 'Color',[0.1,0.6,1]); hold on;
mu = p;
sigma = sqrt(p*(1-p)/N);
MZ = exp(mu*s + sigma^2*s.^2/2);
semilogy(s, MZ,':','LineWidth', 8, 'Color',[0,0,0]);
grid on;
axis([-10, 10 1e-2 1e5]);
legend('Binomial MGF', 'Gaussian MGF','Location','NW');
yticks([1e-2 1e-1 1 1e1 1e2 1e3 1e4 1e5]);
set(gcf, 'Position', [100, 100, 600, 400]);
set(gca,'FontWeight','bold','fontsize',24);

using Distributions, Plots

N = 2
p = 0.5;

B = Bernoulli(p)
mu, sigma = mean(B), std(B)/sqrt(N) 
Z = Normal(mu, sigma)

m_Bernoulli(s) = mgf(B,s)    
m_B(s) = m_Bernoulli(s/N)^N  
m_Z(s) = mgf(Z,s)            

bin_args = (linewidth=8, color=RGB(0.1, 0.6, 1),
            yaxis=:log, yticks = [10.0^i for i in -1:4],
            label="Binomial MGF")
norm_args = (linewidth=8, color=RGB(0, 0, 0), yaxis=:log, linestyle=:dot, label="Gaussian MGF")

plot(m_B, -10, 10; bin_args..., legend=:topleft)
plot!(m_Z; norm_args...)


library(pracma)

p <- 0.5
s <- linspace(-10,10,1001)
MX <- 1-p+p*exp(s)
N <- 2
semilogy(s, (1-p+p*exp(s/N))**N, lwd=4, col="lightblue", xlim=c(-10,10), ylim=c(10**-2, 10**5))
mu <- p
sigma <- sqrt(p*(1-p)/N);
MZ <- exp(mu*s + sigma^2*s**2/2);
lines(s, MZ, lwd=4);
legend("topleft", c('Binomial MGF', 'Gaussian MGF'), fill=c('lightblue', 'black'))

Failure of Central Limit Theorem at tails

% MATLAB: Failure of Central Limit Theorem at tails
x = linspace(-1,5,1001);
lambda = 1;

N = 1;
f1 = (sqrt(N)/lambda)*pdf('gamma',(x+sqrt(N))/(lambda/sqrt(N)),N,lambda);
semilogy(x, f1, 'LineWidth', 4, 'Color', [0.8 0.8 0.8]); hold on;

N = 10;
f2 = (sqrt(N)/lambda)*pdf('gamma',(x+sqrt(N))/(lambda/sqrt(N)),N,lambda);
semilogy(x, f2, 'LineWidth', 4, 'Color', [0.6 0.6 0.6]);

N = 100;
f3 = (sqrt(N)/lambda)*pdf('gamma',(x+sqrt(N))/(lambda/sqrt(N)),N,lambda);
semilogy(x, f3, 'LineWidth', 4, 'Color', [0.4 0.4 0.4]);

N = 1000;
f4 = (sqrt(N)/lambda)*pdf('gamma',(x+sqrt(N))/(lambda/sqrt(N)),N,lambda);
semilogy(x, f4, 'LineWidth', 4, 'Color', [0.2 0.2 0.2]);

g = pdf('norm',x,0,1);
semilogy(x, g, '-.', 'LineWidth', 4, 'Color', [0.9 0.0 0.0]);

grid on;
legend('N = 1', 'N = 10', 'N = 100', 'N = 1000', 'Gaussian', 'Location','SW');
axis([-1 5 min(ylim) max(ylim)]);
set(gcf, 'Position', [100, 100, 600, 400]);
set(gca,'FontWeight','bold','fontsize',18);

using Distributions, Plots


lambda = 1;

function gamma_pdf(N)
    function(x) 
        (sqrt(N)/lambda) * pdf(Gamma(N,lambda), (x+sqrt(N))/(lambda/sqrt(N)))
    end
end
gaussian_pdf(x) = pdf(Normal(), x)


plot(gamma_pdf(1), -1, 5; linewidth=4, color=RGB(0.8, 0.8, 0.8), label="N=1",
     yaxis=:log,
     legend=:bottomleft)
plot!(gamma_pdf(10);   linewidth=4, color=RGB(0.6, 0.6, 0.6), label="N=10")
plot!(gamma_pdf(100);  linewidth=4, color=RGB(0.4, 0.4, 0.4), label="N=100")
plot!(gamma_pdf(1000); linewidth=4, color=RGB(0.2, 0.2, 0.2), label="N=1000")
plot!(gaussian_pdf;    linewidth=4, color=RGB(0.9, 0.0, 0.0), label="Gaussian",
      linestyle=:dash)


library(pracma)

x <- linspace(-1,5,1001)
lambda <- 1

N <- 1
f1 <- (N**(1/2)/lambda)*dgamma((x+sqrt(N))/(lambda/sqrt(N)), N, lambda)
semilogy(x, f1, lwd=1, col='lightgray', xlim=c(-1,5), ylim=c(10**-6, 1))

N <- 10
f1 <- (N**(1/2)/lambda)*dgamma((x+sqrt(N))/(lambda/sqrt(N)), N, lambda)
lines(x, f1, lwd=2, col='gray')

N <- 100
f1 <- (N**(1/2)/lambda)*dgamma((x+sqrt(N))/(lambda/sqrt(N)), N, lambda)
lines(x, f1, lwd=2, col='darkgray')

N <- 1000
f1 <- (N**(1/2)/lambda)*dgamma((x+sqrt(N))/(lambda/sqrt(N)), N, lambda)
lines(x, f1, lwd=2, col='black')

g <- dnorm(x,0,1)
lines(x, g, lwd=2, pch=1, col='red')

legend("bottomleft", c('N=1', 'N=10', 'N=100', 'N=1000', 'Gaussian'), fill=c('lightgray', 'gray', 'darkgray', 'black', 'red'))