bpo-36546: Add statistics.quantiles() by rhettinger · Pull Request #12710 · python/cpython
Expand Up
@@ -3,6 +3,7 @@
"""
import bisect import collections import collections.abc import copy Expand Down Expand Up @@ -2038,6 +2039,7 @@ def test_compare_to_variance(self): expected = math.sqrt(statistics.variance(data)) self.assertEqual(self.func(data), expected)
class TestGeometricMean(unittest.TestCase):
def test_basics(self): Expand Down Expand Up @@ -2126,6 +2128,146 @@ def test_special_values(self): with self.assertRaises(ValueError): geometric_mean([Inf, -Inf])
class TestQuantiles(unittest.TestCase):
def test_specific_cases(self): # Match results computed by hand and cross-checked # against the PERCENTILE.EXC function in MS Excel. quantiles = statistics.quantiles data = [120, 200, 250, 320, 350] random.shuffle(data) for n, expected in [ (1, []), (2, [250.0]), (3, [200.0, 320.0]), (4, [160.0, 250.0, 335.0]), (5, [136.0, 220.0, 292.0, 344.0]), (6, [120.0, 200.0, 250.0, 320.0, 350.0]), (8, [100.0, 160.0, 212.5, 250.0, 302.5, 335.0, 357.5]), (10, [88.0, 136.0, 184.0, 220.0, 250.0, 292.0, 326.0, 344.0, 362.0]), (12, [80.0, 120.0, 160.0, 200.0, 225.0, 250.0, 285.0, 320.0, 335.0, 350.0, 365.0]), (15, [72.0, 104.0, 136.0, 168.0, 200.0, 220.0, 240.0, 264.0, 292.0, 320.0, 332.0, 344.0, 356.0, 368.0]), ]: self.assertEqual(expected, quantiles(data, n=n)) self.assertEqual(len(quantiles(data, n=n)), n - 1) self.assertEqual(list(map(float, expected)), quantiles(map(Decimal, data), n=n)) self.assertEqual(list(map(Decimal, expected)), quantiles(map(Decimal, data), n=n)) self.assertEqual(list(map(Fraction, expected)), quantiles(map(Fraction, data), n=n)) # Invariant under tranlation and scaling def f(x): return 3.5 * x - 1234.675 exp = list(map(f, expected)) act = quantiles(map(f, data), n=n) self.assertTrue(all(math.isclose(e, a) for e, a in zip(exp, act))) # Quartiles of a standard normal distribution for n, expected in [ (1, []), (2, [0.0]), (3, [-0.4307, 0.4307]), (4 ,[-0.6745, 0.0, 0.6745]), ]: actual = quantiles(statistics.NormalDist(), n=n) self.assertTrue(all(math.isclose(e, a, abs_tol=0.0001) for e, a in zip(expected, actual)))
def test_specific_cases_inclusive(self): # Match results computed by hand and cross-checked # against the PERCENTILE.INC function in MS Excel # and against the quaatile() function in SciPy. quantiles = statistics.quantiles data = [100, 200, 400, 800] random.shuffle(data) for n, expected in [ (1, []), (2, [300.0]), (3, [200.0, 400.0]), (4, [175.0, 300.0, 500.0]), (5, [160.0, 240.0, 360.0, 560.0]), (6, [150.0, 200.0, 300.0, 400.0, 600.0]), (8, [137.5, 175, 225.0, 300.0, 375.0, 500.0,650.0]), (10, [130.0, 160.0, 190.0, 240.0, 300.0, 360.0, 440.0, 560.0, 680.0]), (12, [125.0, 150.0, 175.0, 200.0, 250.0, 300.0, 350.0, 400.0, 500.0, 600.0, 700.0]), (15, [120.0, 140.0, 160.0, 180.0, 200.0, 240.0, 280.0, 320.0, 360.0, 400.0, 480.0, 560.0, 640.0, 720.0]), ]: self.assertEqual(expected, quantiles(data, n=n, method="inclusive")) self.assertEqual(len(quantiles(data, n=n, method="inclusive")), n - 1) self.assertEqual(list(map(float, expected)), quantiles(map(Decimal, data), n=n, method="inclusive")) self.assertEqual(list(map(Decimal, expected)), quantiles(map(Decimal, data), n=n, method="inclusive")) self.assertEqual(list(map(Fraction, expected)), quantiles(map(Fraction, data), n=n, method="inclusive")) # Invariant under tranlation and scaling def f(x): return 3.5 * x - 1234.675 exp = list(map(f, expected)) act = quantiles(map(f, data), n=n, method="inclusive") self.assertTrue(all(math.isclose(e, a) for e, a in zip(exp, act))) # Quartiles of a standard normal distribution for n, expected in [ (1, []), (2, [0.0]), (3, [-0.4307, 0.4307]), (4 ,[-0.6745, 0.0, 0.6745]), ]: actual = quantiles(statistics.NormalDist(), n=n, method="inclusive") self.assertTrue(all(math.isclose(e, a, abs_tol=0.0001) for e, a in zip(expected, actual)))
def test_equal_sized_groups(self): quantiles = statistics.quantiles total = 10_000 data = [random.expovariate(0.2) for i in range(total)] while len(set(data)) != total: data.append(random.expovariate(0.2)) data.sort()
# Cases where the group size exactly divides the total for n in (1, 2, 5, 10, 20, 50, 100, 200, 500, 1000, 2000, 5000, 10000): group_size = total // n self.assertEqual( [bisect.bisect(data, q) for q in quantiles(data, n=n)], list(range(group_size, total, group_size)))
# When the group sizes can't be exactly equal, they should # differ by no more than one for n in (13, 19, 59, 109, 211, 571, 1019, 1907, 5261, 9769): group_sizes = {total // n, total // n + 1} pos = [bisect.bisect(data, q) for q in quantiles(data, n=n)] sizes = {q - p for p, q in zip(pos, pos[1:])} self.assertTrue(sizes <= group_sizes)
def test_error_cases(self): quantiles = statistics.quantiles StatisticsError = statistics.StatisticsError with self.assertRaises(TypeError): quantiles() # Missing arguments with self.assertRaises(TypeError): quantiles([10, 20, 30], 13, n=4) # Too many arguments with self.assertRaises(TypeError): quantiles([10, 20, 30], 4) # n is a positional argument with self.assertRaises(StatisticsError): quantiles([10, 20, 30], n=0) # n is zero with self.assertRaises(StatisticsError): quantiles([10, 20, 30], n=-1) # n is negative with self.assertRaises(TypeError): quantiles([10, 20, 30], n=1.5) # n is not an integer with self.assertRaises(ValueError): quantiles([10, 20, 30], method='X') # method is unknown with self.assertRaises(StatisticsError): quantiles([10], n=4) # not enough data points with self.assertRaises(TypeError): quantiles([10, None, 30], n=4) # data is non-numeric
class TestNormalDist(unittest.TestCase):
# General note on precision: The pdf(), cdf(), and overlap() methods Expand Down
"""
import bisect import collections import collections.abc import copy Expand Down Expand Up @@ -2038,6 +2039,7 @@ def test_compare_to_variance(self): expected = math.sqrt(statistics.variance(data)) self.assertEqual(self.func(data), expected)
class TestGeometricMean(unittest.TestCase):
def test_basics(self): Expand Down Expand Up @@ -2126,6 +2128,146 @@ def test_special_values(self): with self.assertRaises(ValueError): geometric_mean([Inf, -Inf])
class TestQuantiles(unittest.TestCase):
def test_specific_cases(self): # Match results computed by hand and cross-checked # against the PERCENTILE.EXC function in MS Excel. quantiles = statistics.quantiles data = [120, 200, 250, 320, 350] random.shuffle(data) for n, expected in [ (1, []), (2, [250.0]), (3, [200.0, 320.0]), (4, [160.0, 250.0, 335.0]), (5, [136.0, 220.0, 292.0, 344.0]), (6, [120.0, 200.0, 250.0, 320.0, 350.0]), (8, [100.0, 160.0, 212.5, 250.0, 302.5, 335.0, 357.5]), (10, [88.0, 136.0, 184.0, 220.0, 250.0, 292.0, 326.0, 344.0, 362.0]), (12, [80.0, 120.0, 160.0, 200.0, 225.0, 250.0, 285.0, 320.0, 335.0, 350.0, 365.0]), (15, [72.0, 104.0, 136.0, 168.0, 200.0, 220.0, 240.0, 264.0, 292.0, 320.0, 332.0, 344.0, 356.0, 368.0]), ]: self.assertEqual(expected, quantiles(data, n=n)) self.assertEqual(len(quantiles(data, n=n)), n - 1) self.assertEqual(list(map(float, expected)), quantiles(map(Decimal, data), n=n)) self.assertEqual(list(map(Decimal, expected)), quantiles(map(Decimal, data), n=n)) self.assertEqual(list(map(Fraction, expected)), quantiles(map(Fraction, data), n=n)) # Invariant under tranlation and scaling def f(x): return 3.5 * x - 1234.675 exp = list(map(f, expected)) act = quantiles(map(f, data), n=n) self.assertTrue(all(math.isclose(e, a) for e, a in zip(exp, act))) # Quartiles of a standard normal distribution for n, expected in [ (1, []), (2, [0.0]), (3, [-0.4307, 0.4307]), (4 ,[-0.6745, 0.0, 0.6745]), ]: actual = quantiles(statistics.NormalDist(), n=n) self.assertTrue(all(math.isclose(e, a, abs_tol=0.0001) for e, a in zip(expected, actual)))
def test_specific_cases_inclusive(self): # Match results computed by hand and cross-checked # against the PERCENTILE.INC function in MS Excel # and against the quaatile() function in SciPy. quantiles = statistics.quantiles data = [100, 200, 400, 800] random.shuffle(data) for n, expected in [ (1, []), (2, [300.0]), (3, [200.0, 400.0]), (4, [175.0, 300.0, 500.0]), (5, [160.0, 240.0, 360.0, 560.0]), (6, [150.0, 200.0, 300.0, 400.0, 600.0]), (8, [137.5, 175, 225.0, 300.0, 375.0, 500.0,650.0]), (10, [130.0, 160.0, 190.0, 240.0, 300.0, 360.0, 440.0, 560.0, 680.0]), (12, [125.0, 150.0, 175.0, 200.0, 250.0, 300.0, 350.0, 400.0, 500.0, 600.0, 700.0]), (15, [120.0, 140.0, 160.0, 180.0, 200.0, 240.0, 280.0, 320.0, 360.0, 400.0, 480.0, 560.0, 640.0, 720.0]), ]: self.assertEqual(expected, quantiles(data, n=n, method="inclusive")) self.assertEqual(len(quantiles(data, n=n, method="inclusive")), n - 1) self.assertEqual(list(map(float, expected)), quantiles(map(Decimal, data), n=n, method="inclusive")) self.assertEqual(list(map(Decimal, expected)), quantiles(map(Decimal, data), n=n, method="inclusive")) self.assertEqual(list(map(Fraction, expected)), quantiles(map(Fraction, data), n=n, method="inclusive")) # Invariant under tranlation and scaling def f(x): return 3.5 * x - 1234.675 exp = list(map(f, expected)) act = quantiles(map(f, data), n=n, method="inclusive") self.assertTrue(all(math.isclose(e, a) for e, a in zip(exp, act))) # Quartiles of a standard normal distribution for n, expected in [ (1, []), (2, [0.0]), (3, [-0.4307, 0.4307]), (4 ,[-0.6745, 0.0, 0.6745]), ]: actual = quantiles(statistics.NormalDist(), n=n, method="inclusive") self.assertTrue(all(math.isclose(e, a, abs_tol=0.0001) for e, a in zip(expected, actual)))
def test_equal_sized_groups(self): quantiles = statistics.quantiles total = 10_000 data = [random.expovariate(0.2) for i in range(total)] while len(set(data)) != total: data.append(random.expovariate(0.2)) data.sort()
# Cases where the group size exactly divides the total for n in (1, 2, 5, 10, 20, 50, 100, 200, 500, 1000, 2000, 5000, 10000): group_size = total // n self.assertEqual( [bisect.bisect(data, q) for q in quantiles(data, n=n)], list(range(group_size, total, group_size)))
# When the group sizes can't be exactly equal, they should # differ by no more than one for n in (13, 19, 59, 109, 211, 571, 1019, 1907, 5261, 9769): group_sizes = {total // n, total // n + 1} pos = [bisect.bisect(data, q) for q in quantiles(data, n=n)] sizes = {q - p for p, q in zip(pos, pos[1:])} self.assertTrue(sizes <= group_sizes)
def test_error_cases(self): quantiles = statistics.quantiles StatisticsError = statistics.StatisticsError with self.assertRaises(TypeError): quantiles() # Missing arguments with self.assertRaises(TypeError): quantiles([10, 20, 30], 13, n=4) # Too many arguments with self.assertRaises(TypeError): quantiles([10, 20, 30], 4) # n is a positional argument with self.assertRaises(StatisticsError): quantiles([10, 20, 30], n=0) # n is zero with self.assertRaises(StatisticsError): quantiles([10, 20, 30], n=-1) # n is negative with self.assertRaises(TypeError): quantiles([10, 20, 30], n=1.5) # n is not an integer with self.assertRaises(ValueError): quantiles([10, 20, 30], method='X') # method is unknown with self.assertRaises(StatisticsError): quantiles([10], n=4) # not enough data points with self.assertRaises(TypeError): quantiles([10, None, 30], n=4) # data is non-numeric
class TestNormalDist(unittest.TestCase):
# General note on precision: The pdf(), cdf(), and overlap() methods Expand Down