#!/usr/bin/env python
# coding: utf-8

# In[1]:

import sympy as sy
import fractions as fr
import matplotlib.pyplot as plt
import scipy as sp
import scipy.special
import scipy.stats
from texttable import Texttable
import numpy as np
import pandas as pd
from mpl_toolkits.mplot3d import Axes3D

plt.style.use("fivethirtyeight")
np.set_printoptions(precision=4)
np.set_printoptions(suppress=True)  # suppress scientific notation

# To proceed to inferential statistics, there is no way around **probability theory**; it is arguably the most important mathematical subject you will ever learn. Once the probabilistic view is deeply implanted in your mind, it will reshape how you see the world.
#
# Here we will review probability theory in a textbook style, i.e. starting from the definition of sets.

# # Basic Concepts Refresher

# ## Sets

# A set is a collection of _distinct elements_; for instance, $(1, 2, 3)$ is a set, but $(2, 2, 3)$ isn't. Here we will also demonstrate a bit of the **SymPy** library, only scratching the surface. SymPy is an extremely powerful library; there are extensive examples in the **Linear Algebra With Python** training sessions.
#
# We'll start by defining sets, then move to basic operations. ```sy``` is our alias for ```sympy```.

# In[2]:

s = sy.FiniteSet(2, 3, 4, 5, 6)
s

# In[3]:

s1 = sy.FiniteSet(2, 1 / 2, fr.Fraction(1, 2), 8)
s1  # the Fraction object keeps the fraction format

# Check if $2$ is in the set $S_1$.

# In[4]:

x = 2
if x in s1:
    print("{} is in the set!".format(x))

# To convert a list into a set, the list needs unpacking with $*$.

# In[5]:

s3_list = [1, 3, 5, 7, 8]
s3 = sy.FiniteSet(*s3_list)
s3

# ## Subsets, Supersets and Power Sets

# Now define two sets: if $a$ is a **superset** of $b$ ($a\supseteq b$), then $b$ is called a **subset** of $a$ ($b\subseteq a$).

# In[6]:

a = sy.FiniteSet(1, 2, 3, 4, 5)
b = sy.FiniteSet(2, 3, 4)

# In this example, $a$ is a **proper superset** of $b$, denoted $a\supset b$, and $b$ is a **proper subset** of $a$, denoted $b\subset a$.
#
# The methods ```is_subset``` and ```is_superset``` do exactly what their names say.

# In[7]:

b.is_subset(a)

# In[8]:

a.is_superset(b)

# The **power set** is the set of all subsets, including the empty set and the set itself.

# In[9]:

b.powerset()

# Every set is its own subset, but not its own proper subset.

# In[10]:

b.is_subset(b)

# In[11]:

b.is_proper_subset(b)

# ## Set Operations

# Define two sets again.

# In[12]:

a = sy.FiniteSet(1, 2, 3)
b = sy.FiniteSet(2, 3, 4)
display(a)
display(b)

# **Union** and **intersection** are SymPy object methods, the same operations we learned in high school.

# In[13]:

display(a.union(b))
display(a.intersect(b))

# ## Python Built-in Sets And Operations

# Set operations are fairly common in general programming, so Python has built-in support for sets. Here are examples using the built-in type.
#
# We define sets in Python with ```{}```, the same braces used for dictionaries.

# In[14]:

a = {1, 3, 5, 7, 9}
b = {1, 7, 10}

# In[15]:

type(a)

# Union operation.

# In[16]:

a | b

# or

# In[17]:

a.union(b)

# Intersection.

# In[18]:

a & b

# Or

# In[19]:

a.intersection(b)

# Set difference.

# In[20]:

a - b

# In[21]:

a.difference(b)

# ## Cartesian Product

# The famous **Cartesian product** is defined mathematically as below.
#
# When two sets are multiplied, the result contains all possible ordered pairs, with the first element from $A$ and the second from $B$:
# $$
# A\times B=\{(a,b)\mid a\in A\ \text{ and }\ b\in B\}
# $$
# A visual example would be helpful.
# Define two sets, then compute the Cartesian product by multiplication.

# In[22]:

x = sy.FiniteSet(*list(range(1, 6)))
y = sy.FiniteSet(*list(range(2, 7)))
z = x * y  # Cartesian product
z

# In[23]:

fig, ax = plt.subplots(figsize=(7, 7))
for i in z:
    ax.scatter(i[0], i[1], s=100)
ax.grid(True)
ax.set_title("Cartesian Product", size=15)
plt.show()

# Here's a more concrete example of a Cartesian product: suppose H&M makes a type of dress with different parameters; a Cartesian product shows all the combinations the dress could have, for instance $('blue', 'polyester')$.

# In[24]:

colours = sy.FiniteSet("red", "black", "blue", "white")
material = sy.FiniteSet("wool", "cotton", "polyester")
colours * material

# ## Probability and Event

# After demonstrating so much set theory, how is it actually used in probability theory?
#
# The dice-rolling problem never gets old. To answer the question _what is the probability of rolling an odd number?_, we show how set operations do the job.

# In[25]:

s = sy.FiniteSet(1, 2, 3, 4, 5, 6)
odd = sy.FiniteSet(1, 3, 5)
p = len(odd) / len(s)
p

# This is entirely intuitive: just the proportion of odd faces over all faces.

# ## Two Dice Problem

# The two dice problem is slightly more complicated. Now we ask: _what's the probability of getting a $7$ while rolling two dice together?_

# Create a dictionary holding the Cartesian product of two dice and the corresponding sums. The ```.update``` method adds an element to the dictionary if the key doesn't exist yet.

# In[26]:

dice_cartesian = {}
for i in range(1, 7):
    for j in range(1, 7):
        dice_cartesian.update({(i, j): i + j})
dice_cartesian

# A Python dictionary has a method ```.items()``` that lists all key-value pairs as tuples. We'll make use of this method in the loop below.

# In[27]:

dice_cartesian.items()

# Use ```defaultdict``` from the ```collections``` module; it creates a dictionary that doesn't raise errors on missing keys, which makes it suitable for counting. We pass ```list``` as the _default factory_, meaning values are initialised as empty lists whenever a new key appears.

# In[28]:

from collections import defaultdict

dice_count = defaultdict(list)
for i, j in dice_cartesian.items():
    dice_count[j].append(i)
dice_count

# Create another dictionary holding every possible sum of the two dice and its corresponding probability.

# In[29]:

Prob = {i: round(len(j) / 6**2, 4) for i, j in dice_count.items()}
Prob

# The example above is really more about Python techniques; the question can also be solved without making a fuss, as below:

# In[30]:

def dice_prob(number):
    dice1, dice2 = list(range(1, 7)), list(range(1, 7))
    cartesian_dice = [
        (i, j) for i in dice1 for j in dice2
    ]  # list comprehension to create the Cartesian product
    occurrence = 0
    for element in cartesian_dice:
        if np.sum(list(element)) == number:
            occurrence += 1
    print(
        "The probability of {} while rolling two dice is {:.2f}%".format(
            number, occurrence / 6**2 * 100
        )
    )

# In[31]:

dice_prob(7)
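# As a sanity check, here is a minimal Monte Carlo sketch (the seed and sample size are arbitrary choices, not part of the original example); the empirical frequency should land near the analytic $6/36 \approx 16.67\%$.

# In[ ]:

# Simulate many rolls of two dice and compare the empirical frequency
# of a sum of 7 with the analytic answer 6/36.
rng = np.random.default_rng(0)  # seed chosen only for reproducibility
rolls = rng.integers(1, 7, size=(100000, 2))  # each row is one roll of two dice
freq_7 = np.mean(rolls.sum(axis=1) == 7)
print("Empirical: {:.2f}%, analytic: {:.2f}%".format(freq_7 * 100, 6 / 36 * 100))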
# ## Combination And Permutation

# Again, high school skills. Two questions help differentiate the two:
#
# 1. Pick 3 letters from the English alphabet; how many ways are there to choose? Use combination.
# 2. Pick 3 letters from the English alphabet to construct a word (it doesn't have to be meaningful); how many ways are there? Use permutation.
# Just remember: whenever a certain order matters, use permutation. In the second question above, 'dog' and 'god' are different words even though the letters are the same. We don't need to memorise the formulas, but FYI:
# $$
# {}_NC_n = \frac{N!}{n!(N-n)!}\\
# {}_NP_n = \frac{N!}{(N-n)!}
# $$

# In[32]:

scipy.special.comb(26, 3)  # combination

# In[33]:

scipy.special.perm(26, 3)  # permutation

# # Conditional Probability, Multiplication Law and Independence

# All of the above was just a warm-up; from here on comes the real deal of probability theory.

# The probability of A given B, the so-called **conditional probability**, is defined as:
#
# \begin{equation}
# P(A|B)=\frac{P(A\cap B)}{P(B)}
# \end{equation}
#
# It is best demonstrated with a joint probability table, which is essentially the discrete numeric form of a joint distribution.
#
# Here I want to introduce a module for drawing text-style tables, particularly useful when you are working in a shell. Install it in Jupyter with ```conda install -c conda-forge texttable```.

# In[34]:

table = Texttable()
table.set_cols_align(["l", "c", "c", "c"])
table.set_cols_valign(["m", "m", "m", "m"])
table.add_rows(
    [
        ["", "Have Ht. Disease", " Not Have ", " Total "],
        ["Male", 0.45, 0.06, 0.51],
        ["Female", 0.36, 0.13, 0.49],
        ["Total", 0.81, 0.19, 1],
    ]
)
print(table.draw())

# **Marginal probability** is the key concept here; it gets its name from being located on the margin of the table: it is the sum of all probabilities in the same column or row.
#
# The marginal probability of having heart disease is $81\%$, and the marginal probability of not having it is $19\%$.
#
# Here is the question: _what is the probability of not having heart disease given that the person is a woman?_ Because the condition is that _the person must be a woman_, the first step is to narrow down the table.

# In[35]:

table = Texttable()
table.set_cols_align(["l", "c", "c", "c"])
table.set_cols_valign(["m", "m", "m", "m"])
table.add_rows(
    [["", "Have Ht. Disease", " Not Have ", " Total "], ["Female", 0.36, 0.13, 0.49]]
)
print(table.draw())

# So the conditional probability of a woman not having heart disease is
# $$
# \frac{P(A\cap B)}{P(B)} = \frac{.13}{.49}
# $$

# In[36]:

P_female = 0.49  # the marginal prob of being a woman, this is P(B)
P_female_no_disease = 0.13
P_con = P_female_no_disease / P_female
print(
    "The probability of not having heart disease given the person is a woman is {0:.2f}%".format(
        P_con * 100
    )
)

# From conditional probability it is straightforward to derive the **multiplication law**; simply rearrange the formula:
# $$
# P(A|B)P(B)=P(A\cap B)\\
# P(B|A)P(A)=P(B\cap A)
# $$

# Two events $A$ and $B$ are independent if
# $$
# P(A|B)=P(A)\\
# P(B|A)=P(B)
# $$
#
# Otherwise, the events are dependent.
#
# The **independent multiplication law** then states
# $$
# P(A)P(B)=P(A\cap B)\\
# P(B)P(A)=P(B\cap A)
# $$

# We will not dive deep into probability problems for now; there will be a full-round training session on **Probability Theory**.
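# To see the independence criterion in action, here is a small sketch using the joint table above: if gender and heart disease were independent, $P(\text{Not Have}\mid\text{Female})$ would equal the marginal $P(\text{Not Have})$. They differ, so the two events are dependent.

# In[ ]:

# A minimal independence check using the joint probability table above:
# independence would require P(no disease | female) == P(no disease).
P_no_disease = 0.19  # marginal probability of not having heart disease
P_no_given_female = 0.13 / 0.49  # conditional probability computed earlier
print("P(no disease)          = {:.4f}".format(P_no_disease))
print("P(no disease | female) = {:.4f}".format(P_no_given_female))
print("Independent?", np.isclose(P_no_disease, P_no_given_female))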
# # Distributions That You Should Know

# These are the most important probability distributions, the ones you should know by heart.

# ## Binomial Distribution

# The first one is the binomial distribution; we will give it more extensive coverage, since the other distributions have similar functions in Python.
#
# A binomial experiment has 4 features:
# * A sequence of $n$ identical trials, e.g. throwing darts
# * Only two outcomes are possible: _success_ or _failure_, e.g. hitting the bullseye or not
# * The probability of success $p$ does not change from trial to trial
# * Trials are independent events, e.g. the first throw doesn't affect the second throw
# The **probability mass function (PMF)** of the binomial distribution is
# $$
# f(k;n,p)={}_nC_k\, p^k q^{n-k}
# $$
#
# | parameter | meaning |
# |---|---|
# | $n$ | number of trials |
# | $k$ | number of successes |
# | $p$ | probability of success |
# | $q$ | probability of failure, $q = 1 - p$ |
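# To tie the formula to SciPy, here is a quick sketch (the parameters $n=10$, $p=0.3$ are arbitrary choices for the check) comparing the hand-rolled PMF with ```scipy.stats.binom.pmf```; the two should agree, and the PMF should sum to one over all $k$.

# In[ ]:

# Hand-rolled binomial PMF vs. scipy.stats.binom.pmf; n and p are arbitrary.
n, p = 10, 0.3
k = np.arange(0, n + 1)
pmf_manual = sp.special.comb(n, k) * p**k * (1 - p) ** (n - k)
pmf_scipy = sp.stats.binom.pmf(k, n, p)
print("Max absolute difference:", np.abs(pmf_manual - pmf_scipy).max())
print("Sum of PMF over all k:", pmf_manual.sum())  # should equal 1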
# Here's a simple example.
#
# A personal banker might meet 50 people enquiring about loans monthly; empirically, 30% of them have a bad credit history. Let's calculate the probability that any given number of them, from 1 to 50, has a bad credit history: 1 person out of 50, 2 persons out of 50, and so on up to all 50.

# Starting from a single number may be more intuitive: what's the probability that the personal banker encounters exactly $14$ persons with a bad credit history in a month?

# In[37]:

n = 50
k = 14  # what is the prob that exactly 14 ppl she met had a bad credit history?
b = scipy.special.comb(n, k)
p = 0.3
f_binomial = b * p**k * (1 - p) ** (n - k)
print(
    "The prob of meeting {0} persons who have a bad credit history is {1:.2f}%.".format(
        k, f_binomial * 100
    )
)

# We can use ```scipy.stats.binom.pmf``` to get an array of PMF values.

# In[38]:

n = 50
p = 0.3
bad_credit_person = np.arange(1, 51)
prob = sp.stats.binom.pmf(bad_credit_person, n, p)

fig, ax = plt.subplots(figsize=(12, 4))
ax.plot(bad_credit_person, prob, lw=3, color="r", alpha=0.5)
ax.set_ylim([0, 0.13])
ax.set_title(
    "The probability that from 1 to 50 persons have a bad credit history",
    size=16,
    x=0.5,
    y=1.02,
)
ax.set_xlabel("Number of Persons With a Bad Credit History", size=12)
ax.set_ylabel("Probability", size=12)
plt.show()

# We can interpret the plot with a few straightforward observations:
# * Most likely the personal banker would encounter $10$ to $20$ persons who have a bad credit history.
# * Encountering fewer than $5$ or more than $25$ persons is much less likely.
# * The highest probability is to encounter $50\times 0.3 = 15$ persons with bad credit (checked numerically below).
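# A one-line numerical check of the last bullet, reusing the ```prob``` array computed above:

# In[ ]:

# The mode of the plotted PMF should sit at n * p = 15.
print("PMF peaks at k =", bad_credit_person[np.argmax(prob)])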
# In the next example we formulate a question using the **cumulative distribution function**; the SciPy function is ```scipy.stats.binom.cdf```.
#
# If a stock trader trades $n$ times a month and has a $p\%$ chance of winning each trade, find the probability that he wins no more than $k$ trades a month.

# In[39]:

n = 20
p = 0.55
k = 12
k1, k2 = 14, 4
win_less = sp.stats.binom.cdf(k, n, p)
win_more = 1 - sp.stats.binom.cdf(k, n, p)
win_betw = sp.stats.binom.cdf(k1, n, p) - sp.stats.binom.cdf(k2, n, p)
print(
    "If a trader's winning rate is {:.0f}%, the probability of winning no more than {} times is {:.1f}% if he trades {} times per month.".format(
        p * 100, k, win_less * 100, n
    )
)
print(
    "If a trader's winning rate is {:.0f}%, the probability of winning more than {} times is {:.1f}% if he trades {} times per month.".format(
        p * 100, k, win_more * 100, n
    )
)
print(
    "If a trader's winning rate is {:.0f}%, the probability of winning more than {} and at most {} times is {:.1f}% if he trades {} times per month.".format(
        p * 100, k2, k1, win_betw * 100, n
    )
)

# Or present it in a text table.

# In[40]:

table = Texttable()
table.set_cols_align(["c", "c", "c"])
table.set_cols_valign(["m", "m", "m"])
table.add_rows(
    [["Win Less", " Win More ", " Win btw 4~14 "], [win_less, win_more, win_betw]]
)
print(table.draw())

# What if the probability of winning ranges from 0.1 to 0.8? What is the probability that he wins $6$ trades or fewer, assuming he trades 20 times every month?

# In[41]:

win_rate = np.arange(0.1, 0.81, 0.05)
win_less = sp.stats.binom.cdf(6, 20, win_rate)
data_dict = {
    "win_rate": win_rate,
    "win_less": win_less,
}  # I am using annotation in Matplotlib, so Pandas is used for easy tracking of data points
df = pd.DataFrame(data_dict)
df

# According to the table above, if the trader has at least a $60\%$ winning rate, there is merely a $0.6\%$ probability that he wins $6$ trades or fewer. Marginally higher winning rates matter a great deal for traders, even in the short run.

# In[42]:

fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(win_rate, win_less)
txt = "This point means that if the winning rate is 40%,\n then the probability of winning\n 6 or fewer trades is 25%."
ax.annotate(
    txt,
    xy=(df.iloc[6][0], df.iloc[6][1]),
    xytext=(0.35, 0.5),
    weight="bold",
    color="r",
    size=14,
    arrowprops=dict(arrowstyle="->", connectionstyle="arc3", color="b"),
)
ax.set_xlabel("Winning Rate")
ax.set_ylabel("Probability of Winning 6 or Fewer Trades")
plt.show()

# ### Binomial Random Variable Generator

# In the SciPy library, every distribution has a random variable generator named ```.rvs```; we set the parameters and the generator returns the randomly generated numbers.

# In[43]:

n = 100
p = 0.3
bino = sp.stats.binom.rvs(n, p, size=10000)

# The red vertical line in the graph below is the mean; because this binomial distribution is nearly symmetric, the mean should theoretically collect the highest number of draws as well.

# In[44]:

txt = "This line is $x = p \cdot n$,\n where the theoretical highest draw should be."
fig, ax = plt.subplots(figsize=(10, 6))
h, bins, patches = ax.hist(bino, bins=160)
ax.axvline(p * n, color="r", ls="--", lw=3)
ax.annotate(
    txt,
    xy=(p * n, h.max() * 0.4),
    xytext=(p * n, h.max() * 0.7),
    weight="bold",
    color="r",
    size=14,
    arrowprops=dict(arrowstyle="->", connectionstyle="arc3", color="b"),
)
ax.set_title("Generated Binomial R.V.s", size=18)
plt.show()
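# Before interpreting the histogram, a quick empirical sketch using the draws above: the sample mean and variance should be close to the theoretical binomial moments $np$ and $npq$.

# In[ ]:

# Sample moments of the generated draws vs. the theoretical binomial moments.
print("Sample mean: {:.2f}, theoretical np  = {:.2f}".format(bino.mean(), n * p))
print("Sample var : {:.2f}, theoretical npq = {:.2f}".format(bino.var(), n * p * (1 - p)))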
# But how to interpret the histogram? A concrete example: you are shooting a basketball at the basket and your chance of success is $30\%$. Say each day you shoot $100$ rounds; in the long run you are most likely to score $30$ successes a day, and it would be extremely unlikely to land a $50$-success day.

# ### Moments of Binomial Distribution

# To return the moments, use the ```.stats``` function with ```moments="mvsk"```, which is short for _mean_, _variance_, _skewness_ and _kurtosis_.

# In[45]:

n = 100
p = 0.3
bino_stats = sp.stats.binom.stats(n, p, moments="mvsk")

table = Texttable()
table.set_cols_align(["c", "c", "c", "c"])
table.set_cols_valign(["m", "m", "m", "m"])
table.add_rows(
    [
        ["mean", " variance ", " skewness ", "kurtosis "],
        [bino_stats[0], bino_stats[1], bino_stats[2], bino_stats[3]],
    ]
)
print(table.draw())

# ## Poisson Distribution

# When $n\rightarrow\infty$ and $p\rightarrow 0$, a binomial distribution approaches a **Poisson distribution** asymptotically, i.e. when $n$ is large and $p$ is small, we can use Poisson to approximate binomial.
#
# Again with the trader's example: if a trader has a $1/1000$ probability of being 'wiped out' on each trade (assume trades are independent, which in reality they are not), and trades $20$ times per month, what is the probability that the trader gets wiped out twice within 5 years?
#
# This problem can be solved with the binomial distribution, formulated as below:
#
# $$
# \text{trades} = 20\times 12\times 5=1200\\
# P(x=2) = \binom{1200}{2}\Big(\frac{1}{1000}\Big)^2\Big(\frac{999}{1000}\Big)^{1198}
# $$

# In[46]:

sp.special.comb(1200, 2) * (1 / 1000) ** 2 * (999 / 1000) ** 1198

# The result tells us that if the trader keeps a frequency of $20$ trades per month, there is roughly a $21.7\%$ probability that he/she gets wiped out twice in the next $5$ years.

# As we mentioned, Poisson is the limiting version of binomial, so this is a suitable case to use it. Calculate $\lambda$:
#
# \begin{equation}
# \lambda = np = 1200 \times \frac{1}{1000} = 1.2
# \end{equation}
#
# It means that over every 5-year span there are on average 1.2 wipe-outs.
#
# \begin{equation}
# P(x=2)=\frac{\lambda^ke^{-\lambda}}{k!}=\frac{1.2^2e^{-1.2}}{2!}
# \end{equation}
#
# Formulated in Python:

# In[47]:

k = 2
n = 20 * 12 * 5  # 20 trades per month, over a 5-year span
p = 1 / 1000
lambdaP = p * n  # lambda in Poisson
p = sp.stats.poisson.pmf(k, lambdaP)
print(
    "The probability of having {0} wiped-out shock(s) in a span of 5 years is {1:.2f}%.".format(
        k, p * 100
    )
)

# You have probably noticed that binomial and Poisson give practically the same answer.
#
# Similarly, what's the probability of getting wiped out more than $k$ times?

# In[48]:

k = 2
prob = 1 - sp.stats.poisson.cdf(k, lambdaP)
print(
    "The probability of having more than %1.0f wiped-out shock(s) in 5 years is %3.3f%%."
    % (k, prob * 100)
)

# ### Poisson Random Variable Generator

# Using the parameter $\lambda = 1.2$, we generate a frequency distribution with the ```sp.stats.poisson.rvs``` function.

# In[49]:

lambdaP = 1.2
poisson = sp.stats.poisson.rvs(lambdaP, size=1000)

fig, ax = plt.subplots(figsize=(10, 6))
h, bins, patches = ax.hist(poisson, bins=10)
ax.set_title("Generated Poisson R.V. Distribution", size=16)
plt.show()
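# As a bridge to the moments below, a quick sketch: the Poisson mean and variance both equal $\lambda$, so the sample statistics of the generated draws should be close to $1.2$.

# In[ ]:

# For a Poisson distribution both mean and variance equal lambda.
print("Sample mean: {:.3f}, lambda = {}".format(poisson.mean(), lambdaP))
print("Sample var : {:.3f}, lambda = {}".format(poisson.var(), lambdaP))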
Distribution", size=16) plt.show() # ### Moments of Poisson Distribution # In[50]: lambdaP = 1.2 poiss_stats = sp.stats.poisson.stats( lambdaP, moments="mvsk" ) # mean, variance, skewness, kurtosis table = Texttable() table.set_cols_align(["c", "c", "c", "c"]) table.set_cols_valign(["m", "m", "m", "m"]) table.add_rows( [ ["mean", " variance ", " skewness ", "kurtosis "], [poiss_stats[0], poiss_stats[1], poiss_stats[2], poiss_stats[3]], ] ) print(table.draw()) # ## Geometric Distribution # The PMF of **Geometric Distribution** is # $$ # f(k)=p(1-p)^k # $$ # # where $k \in \mathbb{Z}^+$ is a number of failures before first success, and $p$ is the probability of success. # # Geometric Distribution is to model the solutions to questions: '_How many times you have to fail in order to embrace the initial success?_' # # If you are shooting basketball to the basket, your success rate is $30\%$, what's the probability of first success after $k$ times of trials? # In[51]: k = 5 p = 0.3 geo_dist = (1 - p) ** k * p print( "The probability of observing exact {} times of failures trials before first success is {:.2f}%.".format( k, geo_dist * 100 ) ) # ```sp.stats.geom.pmf``` function will do the same trick. # In[52]: sp.stats.geom.pmf(k, p) # However, you have noticed the same parameters didn't produce the same result. That's because Scipy has a slightly different definition of parameter, $k$ in Scipy means the total number of trials, therefore $k+1$ would work the same. # In[53]: sp.stats.geom.pmf(k + 1, p) # Again, CDF could answer the a question: _What's the probability of observing $k$ or less than $k$ times of failure before the first success?_ # In[54]: geom_cdf = sp.stats.geom.cdf(k + 1, p) print( "The probability of observing %1.0f or fewer than %1.0f times of failure before a first success is %3.3f%%." % (k, k, geom_cdf * 100) ) # ### Geometric Distribution Moments and Generator # In[55]: mean, var, skew, kurt = sp.stats.geom.stats(p, moments="mvsk") table = Texttable() table.set_cols_align(["c", "c", "c", "c", "c", "c"]) table.set_cols_valign(["m", "m", "m", "m", "m", "m"]) table.add_rows( [ ["p", "k", "mean", " variance ", " skewness ", " kurtosis "], [p, k, mean, var, skew, kurt], ] ) print(table.draw()) # In[56]: geometric = sp.stats.geom.rvs(p, size=10000) fig, ax = plt.subplots(figsize=(10, 6)) h, bins, patches = ax.hist(geometric, bins=30) ax.set_title("Generated Geometric R.V. Distribution", size=16) plt.show() # ## Hypergeometric Distribution # The main difference between hypergeometric and binomial is that the former's sampling is not independent of each other, i.e. the sampling is **without replacement**. # # The PMF of hypergeometric is # # $$ # f(x) =\frac{{K\choose k} {N-K \choose n-k}}{{N\choose n}} # $$ # # Read the PMF with this example: $100$ people live in a building, $20$ of them are stashing drugs, $80$ are clean, but we don't have information who is clean or not. In one field operation, police took away $5$ persons form the building, what is the probability of having exact $2$ persons .are drug stasher? 
# ## Hypergeometric Distribution

# The main difference between hypergeometric and binomial is that the former's draws are not independent of each other, i.e. the sampling is **without replacement**.
#
# The PMF of the hypergeometric distribution is
#
# $$
# f(k) =\frac{{K\choose k} {N-K \choose n-k}}{{N\choose n}}
# $$
#
# Read the PMF through this example: $100$ people live in a building; $20$ of them are stashing drugs and $80$ are clean, but we have no information about who is who. In one field operation, the police take away $5$ persons from the building. What is the probability that exactly $2$ of them are drug stashers?
#
# Solution:
# $$
# \frac{{20\choose2}{80\choose3}}{{100\choose5}}
# $$

# To solve it:

# In[57]:

k = 2
n = 5
K = 20
N = 100
hyper_geo = (
    sp.special.comb(K, k) * sp.special.comb(N - K, n - k) / sp.special.comb(N, n)
)
print(
    "The probability of getting {} drug stashers by taking {} persons away is {:.2f}%.".format(
        k, n, hyper_geo * 100
    )
)

# Or use the SciPy function.

# In[58]:

# SciPy's signature is pmf(k, M, n, N, loc=0), where M is the population size,
# n the number of 'success' states and N the sample size
hgeo = sp.stats.hypergeom.pmf(
    k, N, K, n, loc=0
)  # the argument order is the same as MATLAB's, quite confusing
hgeo

# A histogram provides some intuition for the hypergeometric distribution.

# In[59]:

hgeo_rv = sp.stats.hypergeom.rvs(100, 20, 5, size=10000)

fig, ax = plt.subplots(figsize=(10, 6))
h, bins, patches = ax.hist(hgeo_rv, density=True)
ax.set_title("Generated Hypergeometric R.V. Distribution", size=16)
s = """
It can be interpreted as: 100 persons are in the building,
20 of them are drug stashers, and 5 are taken out of the 100.
The probability of getting from 0 to 5 drug stashers is shown
in the chart. As we can see, it is nearly impossible to get 4 or 5
drug stashers, but getting one is the most likely outcome."""
ax.text(1.6, 0.5, s, fontsize=14, color="red")
plt.show()

# In[60]:

mean, var, skew, kurt = sp.stats.hypergeom.stats(N, K, n, moments="mvsk")

table = Texttable()
table.set_cols_align(["c", "c", "c", "c", "c", "c"])
table.set_cols_valign(["m", "m", "m", "m", "m", "m"])
table.add_rows(
    [
        ["N", "k", "mean", " variance ", " skewness ", " kurtosis "],
        [N, k, mean, var, skew, kurt],
    ]
)
print(table.draw())

# ## Discrete Uniform Distribution

# Rolling a die is the simplest **discrete uniform distribution** generator, from $1$ to $6$. Note that ```high``` is exclusive in ```sp.stats.randint```.

# In[61]:

unif_d = sp.stats.randint.rvs(low=1, high=7, size=1000)

fig, ax = plt.subplots(figsize=(10, 6))
h, bins, patches = ax.hist(unif_d, bins=6, ec="k")
ax.set_title("Discrete Uniform Frequency Distribution", size=16)
plt.show()

# ### Discrete Probability Mass Function

# In[62]:

x = np.arange(0, 8)
l, h = 1, 7
unif_pmf = sp.stats.randint.pmf(x, low=l, high=h)

fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(
    x, unif_pmf, s=100, color="green", label="Low = {}, High = {}".format(l, h - 1)
)
ax.plot(x, unif_pmf, lw=3, color="k")
ax.legend(fontsize=18, loc="center")
plt.show()

# ## Continuous Uniform Distribution

# The PDF of the **continuous uniform distribution** is
#
# \begin{equation}
# f(x)=\frac{1}{b-a}
# \end{equation}
#
# And its r.v. generator is one of the most commonly used functions in NumPy.

# In[63]:

unif = np.random.rand(10000)

fig, ax = plt.subplots(figsize=(10, 6))
h, bins, patches = ax.hist(unif, density=True, bins=30)
ax.set_title("Continuous Uniform Frequency Distribution", size=16)
plt.show()
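# SciPy's ```uniform``` is parameterised by ```loc``` and ```scale```, giving a uniform distribution on $[loc, loc+scale]$; here is a small sketch (the interval $[2, 5]$ is an arbitrary choice) of a uniform r.v. on a general $[a, b]$.

# In[ ]:

# Uniform distribution on [a, b] via loc/scale; the interval is arbitrary.
a, b = 2, 5
unif_ab = sp.stats.uniform.rvs(loc=a, scale=b - a, size=10000)
print("Sample mean: {:.3f}, theoretical (a+b)/2 = {:.3f}".format(unif_ab.mean(), (a + b) / 2))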
# ### CDF and PDF of Continuous Uniform Distribution

# The CDF and PDF of the uniform distribution are ```sp.stats.uniform.cdf``` and ```sp.stats.uniform.pdf``` accordingly.

# In[64]:

# pdf(x, loc=0, scale=1)
# cdf(x, loc=0, scale=1)
x = np.linspace(-0.2, 1.2, 100)
unif_pdf = sp.stats.uniform.pdf(x)
unif_cdf = sp.stats.uniform.cdf(x)

fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(10, 6))
ax[0].plot(x, unif_pdf, lw=4, label="PDF of Continuous Uniform Distribution")
ax[0].set_xlim([-0.1, 1.1])
ax[0].set_ylim([0, 2])
ax[0].legend(fontsize=16)
ax[1].plot(x, unif_cdf, lw=4, label="CDF of Continuous Uniform Distribution")
ax[1].set_xlim([-0.2, 1.2])
ax[1].set_ylim([0, 2])
ax[1].legend(fontsize=16)
plt.show()

# In[65]:

mean, var, skew, kurt = sp.stats.uniform.stats(moments="mvsk")

table = Texttable()
table.set_cols_align(["c", "c", "c", "c"])
table.set_cols_valign(["m", "m", "m", "m"])
table.add_rows(
    [["mean", " variance ", " skewness ", " kurtosis "], [mean, var, skew, kurt]]
)
print(table.draw())

# ## Normal Distribution

# The **normal distribution** is the king of all distributions; there are extensive discussions in my Linear Algebra notes. The PDF of the normal distribution is
# $$
# f(x)=\frac{1}{\sigma \sqrt{2 \pi}} e^{-\frac{1}{2}\left(\frac{x-\mu}{\sigma}\right)^{2}}
# $$

# Below are the plots of its PDF and CDF.

# In[66]:

mu = 2
sigma = 1
x = np.arange(-2, 6, 0.1)
norm_pdf = sp.stats.norm.pdf(x, mu, sigma)
norm_cdf = sp.stats.norm.cdf(x, mu, sigma)

fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(10, 6))
ax[0].plot(x, norm_pdf, lw=4, label="PDF of Normal Distribution", ls="--")
ax[0].legend(fontsize=16, loc="lower left", framealpha=0.2)
ax[1].plot(x, norm_cdf, lw=4, label="CDF of Normal Distribution")
ax[1].legend(fontsize=16, fancybox=True, framealpha=0.5)
plt.show()
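# With the CDF in hand, interval probabilities are just CDF differences; here is a quick sketch checking the well-known 68-95-99.7 rule for the $N(2, 1)$ distribution above.

# In[ ]:

# The empirical rule via CDF differences: roughly 68%, 95% and 99.7%
# of the mass lies within 1, 2 and 3 standard deviations of the mean.
for n_sd in [1, 2, 3]:
    mass = sp.stats.norm.cdf(mu + n_sd * sigma, mu, sigma) - sp.stats.norm.cdf(
        mu - n_sd * sigma, mu, sigma
    )
    print("Within {} sd: {:.2f}%".format(n_sd, mass * 100))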
# ### Inverse Normal CDF

# Here is a useful plot that you might already know by heart. The light blue area covers $95\%$ of the area under the bell curve; each darker blue area on the sides covers $2.5\%$. If we want the point that has $2.5\%$ of the area to its left, we simply need the inverse function of the CDF; in SciPy, ```.ppf``` (percent point function) returns the location of any percentile we specify.

# In[67]:

norm_95_r = sp.stats.norm.ppf(
    0.975
)  # ppf means percent point function, i.e. the inverse CDF
norm_95_l = sp.stats.norm.ppf(0.025)

x = np.linspace(-5, 5, 200)
y = sp.stats.norm.pdf(x)
xl = np.linspace(-5, norm_95_l, 100)
yl = sp.stats.norm.pdf(xl)
xr = np.linspace(norm_95_r, 5, 100)
yr = sp.stats.norm.pdf(xr)

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(x, y, lw=4, label="PDF of Normal Distribution", ls="-", color="#CC4845")
ax.set_ylim([0, 0.45])
ax.fill_between(x, y, 0, alpha=0.6, color="#2E86C3")
ax.fill_between(xl, yl, 0, alpha=0.9, color="#2E86C3")
ax.fill_between(xr, yr, 0, alpha=0.9, color="#2E86C3")
ax.text(-0.2, 0.15, "95%", fontsize=20)
ax.text(-2.3, 0.015, "2.5%", fontsize=12, color="white")
ax.text(2.01, 0.015, "2.5%", fontsize=12, color="white")
ax.annotate(
    "±%.4f" % norm_95_r,
    xy=(norm_95_r, 0),
    xytext=(-0.4, 0.05),
    weight="bold",
    color="#CC4845",
    arrowprops=dict(arrowstyle="->", connectionstyle="arc3", color="#757516"),
    fontsize=16,
)
ax.annotate(
    "±%.4f" % norm_95_r,
    xy=(norm_95_l, 0),
    xytext=(-0.4, 0.05),
    weight="bold",
    color="#CC4845",
    arrowprops=dict(arrowstyle="->", connectionstyle="arc3", color="#757516"),
    fontsize=16,
)
ax.set_title("Normal Distribution And 2.5% Shaded Area", size=20)
plt.show()
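# Since ```.ppf``` is the inverse CDF, feeding its output back into ```.cdf``` should recover the original probability; a round-trip sketch:

# In[ ]:

# Round trip: cdf(ppf(q)) == q, confirming that ppf is the inverse CDF.
for q in [0.025, 0.5, 0.975]:
    print(
        "q = {:.3f} -> ppf = {:+.4f} -> cdf = {:.3f}".format(
            q, sp.stats.norm.ppf(q), sp.stats.norm.cdf(sp.stats.norm.ppf(q))
        )
    )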
# ### Normal Random Variable Generator

# In[68]:

# rvs(loc=0, scale=1, size=1, random_state=None)
norm_rv = sp.stats.norm.rvs(mu, sigma, size=5000)

fig, ax = plt.subplots(figsize=(10, 6))
h, bins, patches = ax.hist(norm_rv, density=True, bins=50)
ax.set_title("Normal Frequency Distribution", size=16)
plt.show()

# In[69]:

mean, var, skew, kurt = sp.stats.norm.stats(mu, sigma, moments="mvsk")

table = Texttable()
table.set_cols_align(["c", "c", "c", "c"])
table.set_cols_valign(["m", "m", "m", "m"])
table.add_rows(
    [["mean", " variance ", " skewness ", " kurtosis "], [mean, var, skew, kurt]]
)
print(table.draw())

# ## Bivariate Normal Distribution

# The most prevalent _multivariate normal distribution_ is the **bivariate normal distribution**:
#
# \begin{equation}
# f_{\boldsymbol{X}}(x_1,x_2)=\frac{1}{\sqrt{(2\pi)^2|\Sigma|}}\exp{\Big(-\frac{(X-\mu)^T\Sigma^{-1}(X-\mu)}{2}\Big)}
# \end{equation}

# There are two ways of formulating a multivariate normal distribution in Python; use whichever you see fit.

# ### 1st Method of Plotting Formulation

# In[70]:

mu_x = 0
sigma_x = 2
mu_y = 0
sigma_y = 2

# Create the grid and the multivariate normal
x = np.linspace(-10, 10, 500)
y = np.linspace(-10, 10, 500)
X, Y = np.meshgrid(x, y)
pos = np.empty(X.shape + (2,))
pos[:, :, 0] = X
pos[:, :, 1] = Y  # more technical than the next one
norm = sp.stats.multivariate_normal(
    [mu_x, mu_y], [[sigma_x, 0], [0, sigma_y]]
)  # frozen; the diagonal entries are the variances

# Make a 3D plot
fig, ax = plt.subplots(figsize=(7, 7), subplot_kw={"projection": "3d"})
ax.plot_surface(X, Y, norm.pdf(pos), cmap="viridis", linewidth=0)
ax.set_xlabel("X axis")
ax.set_ylabel("Y axis")
ax.set_zlabel("Z axis")
ax.set_title("Bivariate Normal Distribution", size=22, y=1)
plt.show()

# ### 2nd Method of Plotting Formulation

# In[71]:

# Parameters to set
mu_x = 0
sigma_x = 7
mu_y = 0
sigma_y = 15

x = np.linspace(-10, 10, 500)
y = np.linspace(-10, 10, 500)
X, Y = np.meshgrid(x, y)
position = np.array([X.flatten(), Y.flatten()]).T  # more intuitive than the former one
binormal_obj = sp.stats.multivariate_normal([mu_x, mu_y], [[sigma_x, 0], [0, sigma_y]])

fig, ax = plt.subplots(figsize=(7, 7))
ax.contourf(X, Y, binormal_obj.pdf(position).reshape(500, 500), cmap="viridis")
plt.show()

# ## Beta Distribution

# The PDF of the **Beta distribution** is
#
# \begin{equation}
# f(x, a, b)=\frac{\Gamma(a+b) x^{a-1}(1-x)^{b-1}}{\Gamma(a) \Gamma(b)}
# \end{equation}
#
# where $0\leq x \leq 1$ and $a>0$, $b>0$; $\Gamma$ is the Gamma function, and $a$ and $b$ decide the shape of the Beta PDF.

# In[72]:

x = np.linspace(0, 1, 100)
fig = plt.figure(figsize=(7, 7))
ax = fig.add_subplot(111)
a = [0.5, 5, 1, 2, 2, 3, 1]  # these are values of parameter a
b = [0.5, 1, 5, 2, 3, 2, 1]
for parameter in zip(a, b):
    beta_pdf = sp.stats.beta.pdf(x, parameter[0], parameter[1])
    ax.plot(
        x, beta_pdf, lw=3, label="$a = %.1f, b = %.1f$" % (parameter[0], parameter[1])
    )
ax.legend()
ax.set_title("PDF of Beta Distribution", size=20)
ax.axis([0, 1, 0, 3])
plt.show()

# In[73]:

x = np.linspace(0, 1, 100)
fig = plt.figure(figsize=(7, 7))
ax = fig.add_subplot(111)
a = [0.5, 5, 1, 2, 2, 3, 1]  # these are values of parameter a
b = [0.5, 1, 5, 2, 3, 2, 1]
for parameter in zip(a, b):
    beta_cdf = sp.stats.beta.cdf(x, parameter[0], parameter[1])
    ax.plot(
        x, beta_cdf, lw=3, label="$a = %.1f, b = %.1f$" % (parameter[0], parameter[1])
    )
ax.legend()
ax.set_title("CDF of Beta Distribution", size=20)
ax.axis([0, 1, 0, 1])
plt.show()

# The Beta distribution is mostly useful as a **prior distribution** in Bayesian estimation, because it is bounded on $[0, 1]$, which is perfect for modeling the _probability distribution of probabilities_. In Bayesian estimation we only care about the proportionality between prior and posterior, and by adjusting $a$ and $b$ we can use the Beta distribution to express many kinds of prior beliefs, including shapes resembling the normal, uniform and exponential distributions.
#
# \begin{equation}
# f(x, a, b)\propto x^{a-1}(1-x)^{b-1}
# \end{equation}

# ## $\chi^2$ Distribution

# The $\chi^2$ distribution is closely connected with the normal distribution: if $z$ has the standard normal distribution, then $z^2$ has the $\chi^2$ distribution with $d.f.=1$. Furthermore, if
#
# \begin{equation}
# z_1, z_2, ..., z_k \sim i.i.d.\ N(0, 1)
# \end{equation}
#
# then the sum of squares has a $\chi^2$ distribution with $d.f. = k$:
#
# \begin{equation}
# \sum_{i=1}^k z_i^2 \sim \chi^2(k)
# \end{equation}
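# A simulation sketch of this fact (the choice $k=4$ and the number of draws are arbitrary): summing the squares of $k$ standard normal draws many times should give a sample mean close to $k$, which is the mean of $\chi^2(k)$.

# In[ ]:

# Sum of k squared standard normals follows a chi-square with k degrees
# of freedom, whose mean equals k.
k = 4  # arbitrary degrees of freedom for the check
z = sp.stats.norm.rvs(size=(100000, k))
chi2_draws = (z**2).sum(axis=1)
print("Sample mean: {:.3f}, theoretical k = {}".format(chi2_draws.mean(), k))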
# We will see in later chapters how the $\chi^2$ distribution comes up when performing hypothesis tests.

# ### $\chi^2$ PDF and CDF

# In[74]:

k = 1
x = np.linspace(0, 5, 100)
chi_pdf = sp.stats.chi2.pdf(x, k)

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(x, chi_pdf, lw=3, c="r", label="$\chi^2$ distribution with d.f. = 1")
ax.legend(fontsize=18)
plt.show()

# In[75]:

fig, ax = plt.subplots(figsize=(10, 6))
for i in range(1, 6):
    x = np.linspace(0, 5, 100)
    chi_pdf = sp.stats.chi2.pdf(x, i)
    ax.plot(x, chi_pdf, lw=3, label="$\chi^2$ distribution with d.f. = %.0d" % i)
ax.legend(fontsize=12)
ax.axis([0, 5, 0, 1])
plt.show()

# ## F Distribution

# If $U_1$ has a $\chi^2$ distribution with $\nu_1$ d.f. and $U_2$ has a $\chi^2$ distribution with $\nu_2$ d.f., then
#
# \begin{equation}
# \frac{U_1/\nu_1}{U_2/\nu_2}\sim F(\nu_1, \nu_2)
# \end{equation}
#
# We use the $F$ distribution for ratios of variances.

# In[76]:

x = np.linspace(0, 5, 100)
fig, ax = plt.subplots(figsize=(10, 6))
df1 = [10, 5, 8, 15]
df2 = [5, 10, 15, 8]
for df in zip(df1, df2):
    f_pdf = sp.stats.f.pdf(x, dfn=df[0], dfd=df[1])
    ax.plot(x, f_pdf, lw=3, label="$df_1 = %.d, df_2 = %.d$" % (df[0], df[1]))
ax.legend(fontsize=15)
plt.show()

# The $\chi^2$ and $F$ distributions are mostly used for statistical testing; we will elaborate on this topic later.

# ## Student's t Distribution

# The **t-distribution** is used when data are approximately normally distributed, i.e. the data follow a bell shape, but the population variance is unknown. We will come back to this topic in the chapter on estimation.
#
# The PDF of the t-distribution is derived from the normal and $\chi^2$ distributions:
# $$
# f(t)=\frac{\Gamma\left(\frac{\nu+1}{2}\right)}{\sqrt{\nu \pi} \Gamma\left(\frac{\nu}{2}\right)}\left(1+\frac{t^{2}}{\nu}\right)^{-\frac{\nu+1}{2}}
# $$
# where $\Gamma(\cdot)$ is the Gamma function, which for positive integers satisfies
# $$
# \Gamma(x) = (x-1)!
# $$
#
# For now we just need a visual memory of how the t-distribution compares to the normal distribution.

# If a sample has $n$ observations, the degrees of freedom (d.o.f.) of the t-distribution is $n-1$; the larger the d.o.f., the closer the shape is to the normal distribution.

# In[77]:

x = np.linspace(-3, 3, 100)
y_norm = sp.stats.norm.pdf(x, loc=0, scale=1)

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(x, y_norm, label="Normal Distribution")
for i in range(1, 11):
    y_t = sp.stats.t.pdf(x, df=i, loc=0, scale=1)
    ax.plot(
        x,
        y_t,
        color="tomato",
        alpha=0.1 * i,
        label="t-Distribution with dof = {}".format(i),
    )
ax.legend()
plt.show()
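# The heavier tails show up clearly in the quantiles; here is a sketch comparing the t critical value at the 97.5th percentile with the normal $1.96$ as the degrees of freedom grow:

# In[ ]:

# t critical values converge to the normal 1.96 as degrees of freedom grow.
for df in [1, 5, 10, 30, 100]:
    print("df = {:>3}: t_0.975 = {:.4f}".format(df, sp.stats.t.ppf(0.975, df)))
print("normal  : z_0.975 = {:.4f}".format(sp.stats.norm.ppf(0.975)))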
# ## Gamma Distribution

# The Gamma distribution is commonly used to model positive variables, such as variances. The common PDF form is
# $$
# f(x)=\frac{1}{\beta^{\alpha}\Gamma(\alpha)} x^{\alpha-1} \exp \left[-\frac{x}{\beta}\right] \text{ for } x>0 \text{ and } \alpha, \beta>0
# $$
# where $\alpha$ is the _shape_ parameter and $\beta$ is the _scale_ parameter. Also note that the PDF must integrate to unity, which implies
# $$
# \beta^{\alpha} \Gamma(\alpha)=\int_{0}^{\infty} x^{\alpha-1} \exp \left[-\frac{x}{\beta}\right] d x
# $$

# In Bayesian statistics, another formulation of the Gamma distribution is more common:
# $$
# p(x)=\frac{1}{\left(\frac{2 \mu}{v}\right)^{v / 2}} \frac{1}{\Gamma\left(\frac{v}{2}\right)} x^{\frac{v-2}{2}} \exp \left[-\frac{x v}{2 \mu}\right] \text{ for } x>0 \text{ and } \mu, v>0
# $$
# where
# $$
# v = 2\alpha\\
# \mu=\alpha \beta
# $$

# Two special cases of the Gamma distribution are the Chi-square distribution $(\mu=v)$ and the exponential distribution $(v=2)$.

# In[78]:

x = np.linspace(0, 10, 100)
fig, ax = plt.subplots(figsize=(18, 6), nrows=1, ncols=2)

for i in range(1, 11):
    y_gamma = sp.stats.gamma(a=i, scale=1).pdf(x=x)
    ax[0].plot(x, y_gamma, label=r"$\alpha={}, \beta=1$".format(i))
    y_gamma = sp.stats.gamma(a=1, scale=i).pdf(x=x)
    ax[1].plot(x, y_gamma, label=r"$\alpha=1,\beta={}$".format(i))

ax[0].legend()
ax[1].legend()
plt.suptitle("Gamma Distribution")
plt.show()

# ## Multivariate Normal Distribution

# Consider a vector of random variables
# $$
# X=\left(x_{1}, x_{2}, \cdots, x_{N}\right)^T
# $$
# that follows a multivariate normal distribution; the PDF is given by
# $$
# f(X)=(2 \pi)^{-N / 2}|\Sigma|^{-1 / 2} \exp \left(-\frac{1}{2}(X-\mu)^T \Sigma^{-1}(X-\mu)\right)
# $$
# where
# $$
# \mu=\left(\mu_{1}, \mu_{2}, \cdots, \mu_{N}\right)
# $$
# and $\Sigma$ is an $N\times N$ (positive definite) covariance matrix.

# ## Multivariate t Distribution

# Consider a vector of random variables
# $$
# X=\left(x_{1}, x_{2}, \cdots, x_{N}\right)^T
# $$
# that follows a multivariate t distribution; the PDF is given by
# $$
# f(X)=\frac{v^{v / 2} \Gamma\left(\frac{v+N}{2}\right)}{\pi^{N / 2} \Gamma\left(\frac{v}{2}\right)}|\Sigma|^{-1 / 2}\left[v+(X-\mu)^{T} \Sigma^{-1}(X-\mu)\right]^{-(v+N) / 2} \text{ for } v>0
# $$
# where
# $$
# \mu=\left(\mu_{1}, \mu_{2}, \cdots, \mu_{N}\right)
# $$
# and $\Sigma$ is an $N\times N$ covariance matrix and $v$ is the degrees of freedom.

# In[79]:

from scipy.stats import multivariate_t

x, y = np.mgrid[-10:10:0.01, -10:10:0.01]
pos = np.dstack((x, y))
rv = multivariate_t(
    loc=[1, -3], shape=[[5, 0.3], [0.3, 1.5]], df=2
)  # the shape (scale) matrix must be symmetric positive definite

fig, ax = plt.subplots(subplot_kw={"projection": "3d"}, figsize=(10, 7))
ax.view_init(30, 0)
ax.plot_surface(x, y, rv.pdf(pos))
plt.show()

# ## Multinomial Distribution

# The **multinomial distribution** differs from the multivariate normal distribution: the former is the generalisation of the binomial distribution, i.e. there are multiple categories of 'success'. The PMF is
# $$
# \operatorname{Pr}\left(X_{1}=x_{1}, \ldots, X_{k}=x_{k}\right)
# = \begin{cases}\frac{n !}{x_{1} ! \cdots x_{k} !} p_{1}^{x_{1}} \times \cdots \times p_{k}^{x_{k}}, & \text { when } \sum_{i=1}^{k} x_{i}=n \\
# 0 & \text { otherwise }\end{cases}
# $$

# For instance, in a black bag there are $10\%$ red candies, $40\%$ yellow candies and $50\%$ blue candies. Catching exactly $2$ red, $3$ yellow and $4$ blue candies has probability
# $$
# \frac{(2+3+4)!}{2!\,3!\,4!}\times .1^2 \times .4^3 \times .5^4\approx 5\%
# $$

# In[85]:

import math as mt

prob = (
    mt.factorial(2 + 3 + 4)
    / (mt.factorial(2) * mt.factorial(3) * mt.factorial(4))
    * 0.1**2
    * 0.4**3
    * 0.5**4
)
print("The probability is {:.4f}".format(prob))
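# The same number should come out of ```scipy.stats.multinomial``` (a cross-check sketch with $n=9$ and the candy proportions above):

# In[ ]:

# Cross-check the hand computation with scipy.stats.multinomial.
print(sp.stats.multinomial.pmf([2, 3, 4], n=9, p=[0.1, 0.4, 0.5]))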
# ## Dirichlet Distribution

# The **Dirichlet distribution** is the multivariate generalisation of the Beta distribution, defined on vectors $(x_1, \dots, x_K)$ with $x_i \geq 0$ and $\sum_{i=1}^K x_i = 1$; its PDF is
# $$
# f(x_1, \dots, x_K; \alpha_1, \dots, \alpha_K)=\frac{\Gamma\left(\sum_{i=1}^K \alpha_i\right)}{\prod_{i=1}^K \Gamma(\alpha_i)} \prod_{i=1}^{K} x_{i}^{\alpha_{i}-1}
# $$
# Just as the Beta distribution serves as the prior for the binomial, the Dirichlet serves as the prior for the multinomial.

# ## Normal-Gamma Distribution

# Consider a vector of random variables
# $$
# X=\left(x_{1}, x_{2}, \cdots, x_{N}\right)^T
# $$
# and a scalar random variable $h$. The joint PDF of $X$ and $h$ follows a Normal-Gamma distribution if it takes the form
# $$
# f(X, h)=(2 \pi)^{-N / 2} h^{N / 2}|\Sigma|^{-1 / 2} \exp \left(-\frac{h}{2}(X-\mu)^{T} \Sigma^{-1}(X-\mu)\right) \frac{1}{\left(\frac{2 m}{v}\right)^{v / 2}} \frac{1}{\Gamma\left(\frac{v}{2}\right)} h^{\frac{v-2}{2}} \exp \left[-\frac{h v}{2 m}\right]
# $$

# The conditional PDF of $X$ given $h$ follows a normal distribution, denoted
# $$
# X \mid h \sim N\left(\mu, h^{-1} \Sigma\right)
# $$
# The marginal PDF of $X$ follows a $t$-distribution:
# $$
# X \sim t\left(\mu, m^{-1} \Sigma, v\right)
# $$
# The marginal PDF of $h$ follows a Gamma distribution, denoted
# $$
# h \sim \operatorname{Gamma}(m, v)
# $$

# # What is a Kernel?

# A **kernel** is the core part of a PDF: what remains after factoring out every multiplicative term that does not depend on $X$, e.g.
# $$
# P(X)= Kg(X)
# $$
# where $g(X)$ is the kernel and $K$ is the constant that ensures the integral equals unity. That also implies
# $$
# \int_{X} g(X) d X=\frac{1}{K}
# $$
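# As a worked example of extracting a kernel (a sketch using the standard normal): the kernel of $N(0,1)$ is $g(x)=e^{-x^2/2}$, and integrating it numerically should recover $1/K=\sqrt{2\pi}$.

# In[ ]:

from scipy.integrate import quad

# Integrate the standard normal kernel exp(-x^2 / 2); the result should equal
# sqrt(2 * pi), the reciprocal of the normalising constant K.
integral, _ = quad(lambda x: np.exp(-(x**2) / 2), -np.inf, np.inf)
print("Integral of kernel: {:.6f}".format(integral))
print("sqrt(2*pi)        : {:.6f}".format(np.sqrt(2 * np.pi)))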