Big Data Essentials¶

L4: Statistical Modeling with Python¶





Yanfei Kang
yanfeikang@buaa.edu.cn
School of Economics and Management
Beihang University
http://yanfei.site


Basic Python modules for statistics¶

NumPy¶

NumPy, short for Numerical Python, is the foundational package for scientific computing in Python. It contains, among other things:

  • a powerful N-dimensional array object
  • sophisticated (broadcasting) functions
  • tools for integrating C/C++ and Fortran code
  • useful linear algebra, Fourier transform, and random number capabilities
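
A minimal sketch (added here for illustration; the array values are arbitrary) of the N-dimensional array object and broadcasting:

import numpy as np

a = np.arange(6).reshape(2, 3)           # 2x3 array: [[0 1 2], [3 4 5]]
row_means = a.mean(axis=1)               # per-row means: [1. 4.]
centered = a - row_means[:, np.newaxis]  # broadcasting subtracts each row's mean
print(centered)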

Readings¶

  • NumPy Reference
  • NumPy User Guide

SciPy¶

SciPy is a collection of packages addressing a number of different standard problem domains in scientific computing. Here is a sampling of the packages included:

  • scipy.integrate : numerical integration routines and differential equation solvers.
  • scipy.linalg : linear algebra routines and matrix decompositions extending beyond those provided in numpy.linalg.
  • scipy.optimize : function optimizers (minimizers) and root finding algorithms.
  • scipy.signal : signal processing tools.
  • scipy.stats : standard continuous and discrete probability distributions (density functions, samplers, continuous distribution functions), various statistical tests, and more descriptive statistics.
  • scipy.cluster : clustering algorithms.
  • scipy.interpolate : interpolation and smoothing splines.
  • scipy.ndimage : N-dimensional image processing.
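
As a quick illustration (a sketch added here, not from the original notes), scipy.stats and scipy.optimize can be used as follows; the distribution parameters and the quadratic objective are arbitrary:

from scipy import stats, optimize

# Draw 1000 samples from N(0, 1) and test whether their mean differs from 0.
x = stats.norm.rvs(loc=0, scale=1, size=1000, random_state=42)
t_stat, p_value = stats.ttest_1samp(x, popmean=0)

# Minimize a simple quadratic with the derivative-free Nelder-Mead method.
result = optimize.minimize(lambda z: (z[0] - 3.0) ** 2, x0=[0.0], method='Nelder-Mead')
print(t_stat, p_value, result.x)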

Readings¶

SciPy Reference Guide

pandas¶

pandas provides rich data structures and functions designed to make working with structured data fast, easy, and expressive. It builds on and complements the capabilities of NumPy, SciPy, and Matplotlib.

  • Primary data structures: Series and DataFrame.
  • Index objects enabling both simple axis indexing and multi-level / hierarchical axis indexing.
  • An integrated group by engine for aggregating and transforming data sets.
  • Date range generation (date_range) and custom date offsets enabling the implementation of customized frequencies
  • Input/Output tools: loading tabular data from flat files (CSV, delimited, Excel 2003), and saving and loading pandas objects from the fast and efficient PyTables/HDF5 format.
  • Memory-efficient “sparse” versions of the standard data structures for storing data that is mostly missing or mostly constant (some fixed value).
  • Moving window statistics (rolling mean, rolling standard deviation, etc.)
  • Static and moving window linear and panel regression.
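
A small sketch (added here; the data are made up) of the Series and DataFrame structures and the groupby engine:

import pandas as pd

df = pd.DataFrame({'symbol': ['AAPL', 'AAPL', 'MSFT', 'MSFT'],
                   'price': [112.98, 113.10, 40.12, 40.50]})
print(df['price'])                           # a single column is a Series
print(df.groupby('symbol')['price'].mean())  # aggregation via the groupby engine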

Readings¶

pandas Documentation

Visualization¶

  • matplotlib is the most popular Python library for producing plots and other 2D data visualizations.
  • seaborn is a Python data visualization library based on matplotlib. It provides a high-level interface for drawing attractive and informative statistical graphics.
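
A minimal plotting sketch (added here, not from the original notes; the data are simulated, and histplot assumes seaborn >= 0.11):

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

x = np.random.normal(size=200)   # simulated data
sns.histplot(x, kde=True)        # histogram with a kernel density estimate
plt.xlabel('x')
plt.show()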

Readings¶

  • matplotlib
  • seaborn


Working with Data¶

Read and write data in Python with stdin and stdout¶

In [1]:
cat ./code/L4/line_count.py
#! /usr/bin/env python3

import sys
count = 0
data = []
for line in sys.stdin:
    count += 1
    data.append(line)

print("Total ",count, "lines read.") # print goes to sys.stdout
#print(data)

Then launch a terminal, make your Python script executable, and pipe your file to it:

chmod +x line_count.py
In [2]:
cat BDE-L3-git.ipynb | ./code/L4/line_count.py
Total  341 lines read.

Read from and write to files directly¶

You can also read from and write to files directly in your code. Python makes working with files pretty simple.

  • The first step to working with a text file is to obtain a file object using open()

    'r' means read-only

      file_for_reading = open('reading_file.txt', 'r')

    'w' is write -- will destroy the file if it already exists!

      file_for_writing = open('writing_file.txt', 'w')

    'a' is append -- for adding to the end of the file

      file_for_appending = open('appending_file.txt', 'a')
  • The second step is to do something with the file.

  • Don't forget to close your files when you're done.

      file_for_writing.close()

Note: Because it is easy to forget to close your files, you should always use them in a with block, at the end of which they will be closed automatically:

with open(filename,'r') as f:
    data = function_that_gets_data_from(f)
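
For completeness, a small sketch (added here) of writing and appending inside with blocks; the file name is arbitrary:

with open('writing_file.txt', 'w') as f:   # 'w' overwrites any existing file
    f.write('first line\n')

with open('writing_file.txt', 'a') as f:   # 'a' appends to the end
    f.write('second line\n')

with open('writing_file.txt', 'r') as f:
    print(f.read())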
In [3]:
cat ./code/L4/hash_check.py
#! /usr/bin/env python3

import os
import re

starts_with_hash = 0

# Look at each line in the file; use a regex to see if it starts with '#'.
# If it does, add 1 to the count. Note that open() does not expand '~',
# so the home directory is expanded explicitly.

with open(os.path.expanduser('~/lectures/code/L3/line_count.py'), 'r') as file:
    for line in file:
        if re.match("^#", line):
            starts_with_hash += 1
print(starts_with_hash)

Read a CSV file¶

  • If your file has no headers, you can use csv.reader() in the csv module to iterate over the rows, each of which will be an appropriately split list (a minimal sketch follows this list).

  • If your file has headers, you can either

    • skip the header row (with an initial call to next(reader)), or
    • get each row as a dict (with the headers as keys) by using csv.DictReader().
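
A csv.reader sketch for the header-less case (added here; it assumes a tab-delimited file without a header row, with exactly three fields per row):

import csv

with open('data/stocks.csv', 'r') as f:
    reader = csv.reader(f, delimiter='\t')
    for row in reader:
        # each row is a list of strings, e.g. ['AAPL', '2015-01-23', '112.98']
        symbol, date, closing_price = row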
In [1]:
#! /usr/bin/python3

data = []
file = open('data/stocks.csv','r')
next(file)
for line in file:
    data.append(line)

print(data[0])
AAPL	2015-01-23	112.98

In [7]:
#! /usr/bin/env python3

import csv

data = {'date':[], 'symbol':[], 'closing_price' : []}
with open('data/stocks.csv', 'r') as f:
    reader = csv.DictReader(f, delimiter='\t')
    for row in reader:
        data['date'].append(row["date"])
        data['symbol'].append(row["symbol"])
        data['closing_price'].append(float(row["closing_price"]))
In [8]:
data.keys()
Out[8]:
dict_keys(['date', 'symbol', 'closing_price'])

Alternatively, pandas provides the read_csv() function to read CSV files.

In [9]:
#! /usr/bin/env python3

import pandas

data2 = pandas.read_csv('data/stocks.csv', delimiter='\t',header=None)
print(len(data2))
print(type(data2))
16556
<class 'pandas.core.frame.DataFrame'>

The pandas I/O API is a set of top-level reader functions, accessed like read_csv(), that generally return a pandas object. These functions include:

read_excel
read_hdf
read_sql
read_json
read_msgpack (experimental)
read_html
read_gbq (experimental)
read_stata
read_sas
read_clipboard
read_pickle

See pandas IO tools for a detailed explanation.
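
As a rough sketch (added here), these readers and the matching writers share a similar interface; the file names below are hypothetical:

import pandas as pd

df = pd.read_excel('data/some_table.xlsx', sheet_name=0)  # hypothetical Excel file
df.to_pickle('some_table.pkl')                            # write a pandas object...
df_again = pd.read_pickle('some_table.pkl')               # ...and read it back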



Fitting linear regression models¶

In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.optimize import fmin
import seaborn as sns
In [14]:
# Data available at https://www.kaggle.com/puxama/bostoncsv
# Contains information collected by the U.S Census Service concerning housing in the area of Boston Mass. 
boston = pd.read_csv('./data/Boston.csv')
boston.head()
Out[14]:
Unnamed: 0 crim zn indus chas nox rm age dis rad tax ptratio black lstat medv
0 1 0.00632 18.0 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 396.90 4.98 24.0
1 2 0.02731 0.0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 396.90 9.14 21.6
2 3 0.02729 0.0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 392.83 4.03 34.7
3 4 0.03237 0.0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 394.63 2.94 33.4
4 5 0.06905 0.0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 396.90 5.33 36.2
  • CRIM - per capita crime rate by town
  • ZN - proportion of residential land zoned for lots over 25,000 sq.ft.
  • INDUS - proportion of non-retail business acres per town.
  • CHAS - Charles River dummy variable (1 if tract bounds river; 0 otherwise)
  • NOX - nitric oxides concentration (parts per 10 million)
  • RM - average number of rooms per dwelling
  • AGE - proportion of owner-occupied units built prior to 1940
  • DIS - weighted distances to five Boston employment centres
  • RAD - index of accessibility to radial highways
  • TAX - full-value property-tax rate per $10,000
  • PTRATIO - pupil-teacher ratio by town
  • B - 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
  • LSTAT - % lower status of the population: Proportion of population that is lower status = 1/2 (proportion of adults without some high school education and proportion of male workers classified as laborers).
  • MEDV - Median value of owner-occupied homes in $1000's
In [15]:
boston.plot(x = 'lstat', y = 'medv', style = 'o', legend=False, ylabel = 'medv')
Out[15]:
<AxesSubplot:xlabel='lstat', ylabel='medv'>
[Figure: scatter plot of medv against lstat]

Recall the linear regression model¶

$$Y_i = \beta_0 + \beta_1 x_i + \epsilon_i,~i=1, \cdots, n,~\textrm{where}~\epsilon_i \sim N(0,\sigma^2).$$

We want to minimize

$$f(\beta_0, \beta_1) = \sum\limits_{i=1}^n e_i^2 = \sum\limits_{i=1}^n(y_i - (\beta_0 + \beta_1x_i))^2.$$

Linear regression model¶

What is the distribution of $Y_i$?

Since $\epsilon_i \sim N(0, \sigma^2)$, each response is normally distributed about the regression line: $Y_i \sim N(\beta_0 + \beta_1 x_i, \sigma^2)$.

[Figure illustrating the distribution of $Y_i$ about the regression line]
In [16]:
sum_of_squares = lambda beta, x, y: np.sum((y - beta[0] - beta[1]*x) ** 2)
sum_of_squares([0,0.7], boston.lstat, boston.medv)
Out[16]:
184221.366967

Our objective, however, is to minimize the sum of squares, so we can pass this function to one of several optimizers in SciPy.

In [17]:
help(fmin)
Help on function fmin in module scipy.optimize.optimize:

fmin(func, x0, args=(), xtol=0.0001, ftol=0.0001, maxiter=None, maxfun=None, full_output=0, disp=1, retall=0, callback=None, initial_simplex=None)
    Minimize a function using the downhill simplex algorithm.
    
    This algorithm only uses function values, not derivatives or second
    derivatives.
    
    Parameters
    ----------
    func : callable func(x,*args)
        The objective function to be minimized.
    x0 : ndarray
        Initial guess.
    args : tuple, optional
        Extra arguments passed to func, i.e., ``f(x,*args)``.
    xtol : float, optional
        Absolute error in xopt between iterations that is acceptable for
        convergence.
    ftol : number, optional
        Absolute error in func(xopt) between iterations that is acceptable for
        convergence.
    maxiter : int, optional
        Maximum number of iterations to perform.
    maxfun : number, optional
        Maximum number of function evaluations to make.
    full_output : bool, optional
        Set to True if fopt and warnflag outputs are desired.
    disp : bool, optional
        Set to True to print convergence messages.
    retall : bool, optional
        Set to True to return list of solutions at each iteration.
    callback : callable, optional
        Called after each iteration, as callback(xk), where xk is the
        current parameter vector.
    initial_simplex : array_like of shape (N + 1, N), optional
        Initial simplex. If given, overrides `x0`.
        ``initial_simplex[j,:]`` should contain the coordinates of
        the jth vertex of the ``N+1`` vertices in the simplex, where
        ``N`` is the dimension.
    
    Returns
    -------
    xopt : ndarray
        Parameter that minimizes function.
    fopt : float
        Value of function at minimum: ``fopt = func(xopt)``.
    iter : int
        Number of iterations performed.
    funcalls : int
        Number of function calls made.
    warnflag : int
        1 : Maximum number of function evaluations made.
        2 : Maximum number of iterations reached.
    allvecs : list
        Solution at each iteration.
    
    See also
    --------
    minimize: Interface to minimization algorithms for multivariate
        functions. See the 'Nelder-Mead' `method` in particular.
    
    Notes
    -----
    Uses a Nelder-Mead simplex algorithm to find the minimum of function of
    one or more variables.
    
    This algorithm has a long history of successful use in applications.
    But it will usually be slower than an algorithm that uses first or
    second derivative information. In practice, it can have poor
    performance in high-dimensional problems and is not robust to
    minimizing complicated functions. Additionally, there currently is no
    complete theory describing when the algorithm will successfully
    converge to the minimum, or how fast it will if it does. Both the ftol and
    xtol criteria must be met for convergence.
    
    Examples
    --------
    >>> def f(x):
    ...     return x**2
    
    >>> from scipy import optimize
    
    >>> minimum = optimize.fmin(f, 1)
    Optimization terminated successfully.
             Current function value: 0.000000
             Iterations: 17
             Function evaluations: 34
    >>> minimum[0]
    -8.8817841970012523e-16
    
    References
    ----------
    .. [1] Nelder, J.A. and Mead, R. (1965), "A simplex method for function
           minimization", The Computer Journal, 7, pp. 308-313
    
    .. [2] Wright, M.H. (1996), "Direct Search Methods: Once Scorned, Now
           Respectable", in Numerical Analysis 1995, Proceedings of the
           1995 Dundee Biennial Conference in Numerical Analysis, D.F.
           Griffiths and G.A. Watson (Eds.), Addison Wesley Longman,
           Harlow, UK, pp. 191-208.

In [18]:
x = boston.lstat
y = boston.medv
b0, b1 = fmin(sum_of_squares, [0,1], (boston.lstat, boston.medv))
b0, b1
Optimization terminated successfully.
         Current function value: 19472.381418
         Iterations: 82
         Function evaluations: 157
Out[18]:
(34.55385886222141, -0.9500515380716178)
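
As a cross-check (added here, not part of the original notebook), the same least-squares coefficients can be obtained directly with np.polyfit, which should agree with the fmin result up to optimizer tolerance:

b1_ls, b0_ls = np.polyfit(boston.lstat, boston.medv, 1)  # coefficients, highest degree first
print(b0_ls, b1_ls)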
In [19]:
ax = boston.plot(x='lstat', y='medv', style='o', legend=False, ylabel= 'medv')
ax.plot([0,37], [b0, b0+b1*37])
for xi, yi in zip(x,y):
    ax.plot([xi, xi], [yi, b0+b1*xi], 'k:')
[Figure: scatter plot of medv against lstat with the fitted line and vertical residual segments]

Polynomial regression¶

We are not restricted to a straight-line regression model; we can represent a curved relationship between our variables by introducing polynomial terms.

In [20]:
sum_squares_quad = lambda beta, x, y: np.sum((y - beta[0] - beta[1]*x - beta[2]*(x**2)) ** 2)
b0,b1,b2 = fmin(sum_squares_quad, [1,1,-1], args=(x,y))
print('\nintercept: {0:.2}, x: {1:.2}, x2: {2:.2}'.format(b0,b1,b2))
ax = boston.plot(x='lstat', y='medv', style='o', legend=False, ylabel = 'medv')
xvals = np.linspace(0, 37, 100)
ax.plot(xvals, b0 + b1*xvals + b2*(xvals**2))
Optimization terminated successfully.
         Current function value: 15347.243159
         Iterations: 187
         Function evaluations: 342

intercept: 4.3e+01, x: -2.3, x2: 0.044
Out[20]:
[<matplotlib.lines.Line2D at 0x7fd5ba6a1e80>]
[Figure: scatter plot of medv against lstat with the fitted quadratic curve]
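
The quadratic fit can likewise be verified (a check added here) with a degree-2 np.polyfit, using the x and y defined above:

b2_q, b1_q, b0_q = np.polyfit(x, y, 2)   # degree-2 fit, coefficients from x^2 down
print(b0_q, b1_q, b2_q)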

Generalized linear models¶

Often our data violate one or more of the linear regression assumptions:

  • non-linearity
  • non-normal error distribution
  • heteroskedasticity

This forces us to generalize the regression model in order to account for these characteristics.

As a motivating example, we consider the Olympic medals data.

Linear regression models¶

  • Objective: model the expected value of a continuous variable $Y$, as a linear function of the continuous predictor $X$, $E(Y_i) = \beta_0 + \beta_1X_i$.
  • Model structure: $Y_i = \beta_0 + \beta_1X_i + e_i$.
  • Model assumptions: $Y$ is normally distributed; the errors $e_i \sim N(0, \sigma^2)$ are independent and normally distributed with constant variance $\sigma^2$; and $X$ is fixed.
  • Parameter estimates and interpretation: $\hat{\beta}_0$ is the estimate of the intercept $\beta_0$, and $\hat{\beta}_1$ is the estimate of the slope $\beta_1$. Think about the interpretation of the intercept and the slope.
  • Model fit: $R^2$, residual analysis, F-statistic.
  • Model selection: From a plethora of possible predictors, which variables to include?

Generalized linear models (GLM)¶

  • The term general linear model usually refers to conventional linear regression models for a continuous response variable given continuous and/or categorical predictors.
  • Its form is $y_i \sim N(x^T_i\beta,\sigma^2)$, where $x_i$ contains known covariates and $\beta$ contains the coefficients to be estimated. These models are usually fit by least squares or weighted least squares.
  • In a generalized linear model (GLM), $y_i$ is instead assumed to follow an exponential family distribution with mean $\mu_i$, which is assumed to be some (often nonlinear) function of $x^T_i\beta$.

GLM assumptions¶

  • The data $Y_1, Y_2, \cdots, Y_n$ are independently distributed, i.e., cases are independent.
  • $Y_i$ does NOT need to be normally distributed, but it typically assumes a distribution from an exponential family (e.g. binomial, Poisson)
  • GLM does NOT assume a linear relationship between $Y$ and $X$, but it does assume linear relationship between the transformed response in terms of the link function and the explanatory variables.
  • $X$ can be even the power terms or some other nonlinear transformations of the original independent variables.
  • Overdispersion (when the observed variance is larger than what the model assumes) may be present.
  • Errors need to be independent but NOT normally distributed.
  • Parameters are estimated by maximum likelihood (MLE) rather than ordinary least squares (OLS).
In [21]:
medals = pd.read_csv('./data/medals.csv')
medals.head()
Out[21]:
medals population oecd log_population
0 1 96165 0 11.473821
1 1 281584 0 12.548186
2 6 2589043 0 14.766799
3 25 10952046 0 16.209037
4 41 18348078 1 16.725035

We expect a positive relationship between population and awarded medals, but the data in their raw form are clearly not amenable to linear regression.

In [22]:
medals.plot(x='population', y='medals', kind='scatter')
Out[22]:
<AxesSubplot:xlabel='population', ylabel='medals'>
[Figure: scatter plot of medals against population]

Part of the issue is the scale of the variables. For example, countries' populations span several orders of magnitude. We can correct this by using the logarithm of population, which we have already calculated.

In [23]:
medals.plot(x='log_population', y='medals', kind='scatter')
Out[23]:
<AxesSubplot:xlabel='log_population', ylabel='medals'>
[Figure: scatter plot of medals against log_population]
  • This is an improvement, but the relationship is still not adequately modeled by least-squares regression.

  • This is due to the fact that the response data are counts. As a result, they tend to have characteristic properties.

    • discrete
    • positive
    • variance grows with mean
  • To account for this, we can do two things:

    1. model the expected medal count on the log scale;
    2. assume a Poisson, rather than a normal, distribution for the response.

Poisson regression¶

  • So, we will model the logarithm of the expected value as a linear function of our predictors:

$$y_i \sim \text{Poisson}(\lambda_i),$$ $$\log(\lambda_i) = \eta_i = \beta_0 + \beta_1x_{i,1} + \cdots + \beta_qx_{i,q}.$$

  • The log link function forces the mean to be positive.

  • Poisson log likelihood:

$$\log L = \sum_{i=1}^n -\exp(x_i^T\beta) + y_i (x_i^T \beta)- \log(y_i!)$$

  • As we have already done, we just need to code the kernel of this likelihood, and optimize!
In [24]:
# Poisson negative log-likelihood (kernel only; the log(y_i!) term does not depend on beta)
poisson_loglike = lambda beta, X, y: -(-np.exp(X.dot(beta)) + y*X.dot(beta)).sum()
In [25]:
b1, b0 = fmin(poisson_loglike, [0,1], args=(medals[['log_population']].assign(intercept=1).values, 
                                            medals.medals.values))
b0, b1
Optimization terminated successfully.
         Current function value: -1381.299433
         Iterations: 68
         Function evaluations: 131
Out[25]:
(-5.297302917060439, 0.44873025169011005)
In [26]:
ax = medals.plot(x='log_population', y='medals', kind='scatter')
xvals = np.arange(12, 22)
ax.plot(xvals, np.exp(b0 + b1*xvals), 'r--')
Out[26]:
[<matplotlib.lines.Line2D at 0x7fd5b8d999a0>]
[Figure: scatter plot of medals against log_population with the fitted Poisson regression curve]
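
The same Poisson regression can be fit with statsmodels (a cross-check added here; it assumes statsmodels is installed), which also reports standard errors:

import statsmodels.api as sm

X = sm.add_constant(medals['log_population'])   # adds an intercept column
poisson_fit = sm.GLM(medals['medals'], X, family=sm.families.Poisson()).fit()
print(poisson_fit.params)                       # comparable to (b0, b1) above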

More on modeling¶

  • Statsmodels for advanced modeling.

  • Scikit-learn for statistical learning.
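
For instance (sketches added here, assuming both libraries are installed), the straight-line fit above could also be done with either library:

import statsmodels.formula.api as smf
from sklearn.linear_model import LinearRegression

# statsmodels: R-style formulas with full inferential output (standard errors, p-values, ...)
ols_fit = smf.ols('medv ~ lstat', data=boston).fit()
print(ols_fit.params)

# scikit-learn: estimator API geared towards prediction pipelines
lr = LinearRegression().fit(boston[['lstat']], boston['medv'])
print(lr.intercept_, lr.coef_)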