Ann's Code Guidebook: Python Pandas Library

What is Pandas

Pandas : Panel Data

Structure data as a virtual spreadsheet.

Visualize data to find issues such as sparse datasets or missing data.

Pandas.pydata.org

Pandas Docs

Pandas Quick Overview

NumPy.org

SciPy.org

See PyPI for 3rd party libraries

>>> import pandas as pd
>>> from pandas import DataFrame, Series
>>> help (pd)

Applications

For a recommendation system, create a rating matrix and predict missing ratings for each user
based on existing reviews.

Classification

Value estimation

Basics

Series

DataFrame

Datetime index (DatetimeIndex)

Example : First Look at pandas

import numpy as np
import pandas as pd
# A Series is a labelled 1-D array; np.nan marks the missing entry.
mysr = pd.Series([2, 4, 6, np.nan, 7, 9])
mydt = pd.date_range('20200202', periods=6)       ## default frequency is daily
mynp = np.arange(24).reshape((6, 4))
# 6 dated rows x 4 columns named P, Q, S, T.
mydf = pd.DataFrame(mynp, index=mydt, columns=list('PQST'))
print(mydf.head())                                ## head() defaults to the first 5 rows
print(mydf.tail(10))                              ## tail() defaults to 5 rows; asking for 10
                                                  ## on a 6-row frame returns all 6 rows
print(mydf.values)
print(mydf.index)
print(mydf.columns)

# Show floats with 2 decimal places in printed output (see the pandas option docs).
# Set once, before describe(), so the summary below is printed with this precision.
pd.set_option('display.precision', 2)
print(mydf.describe())                            ## a quick statistical summary,
                                                  ## int will be viewed as float

# Create a data frame from a python dictionary. The scalar values ('float',
# 'time', 'misc') are repeated for every row; the list-like values supply 4 rows.
dcdf = pd.DataFrame({
    'float': 1.,
    'time': pd.Timestamp('20200808'),
    'series': pd.Series(1, index=list(range(4)), dtype='float32'),
    'array': np.array([3] * 4, dtype='int32'),
    'categories': pd.Categorical(['t1', 't2', 't3', 't4']),
    'misc': 'useless data',
})
print(dcdf.dtypes)
print(dcdf.T)

# sort_index / sort_values return a new frame rather than sorting in place,
# so keep the results and print them.
cols_desc = dcdf.sort_index(axis=1, ascending=False)   ## sort along the axis, default axis=0
print(cols_desc)
# 'Q' is a column of mydf, not of dcdf, so sorting dcdf by it raises KeyError;
# sort by a column this frame actually has.
rows_desc = dcdf.sort_values(by='categories', ascending=False)
print(rows_desc)

Example : get json data and plot density charts

from pandas import DataFrame, Series
import pandas as pd
import json
from  collections import defaultdict

def get_count(l=()):
    """Count how many times each item occurs in ``l``.

    Args:
        l: Iterable of hashable items. Defaults to an empty tuple, which is
            immutable and therefore safe to use as a default argument.

    Returns:
        A ``defaultdict(int)`` mapping each distinct item to its number of
        occurrences. Looking up an item that was never seen yields 0.
    """
    res = defaultdict(int)
    for i in l:
        res[i] += 1
    return res

def best_count(h=None, n=5):
    """Return the ``n`` entries with the highest counts from a count mapping.

    Args:
        h: Mapping of item -> count (for example the result of ``get_count``).
            ``None`` or an empty mapping yields an empty list.
        n: Maximum number of entries to return. Values <= 0 yield an empty list.

    Returns:
        A list of ``(count, item)`` tuples ordered by ascending count, so the
        most frequent item is last. Items with equal counts keep the order in
        which they appear in ``h``.
    """
    if not h or n <= 0:
        return []
    # Sort on the count only, so items never need to be comparable with each other.
    pairs = sorted(((v, k) for k, v in h.items()), key=lambda pair: pair[0])
    return pairs[-n:]

# file.json is read line by line: each non-blank line must be one JSON
# object (dict) with many fields.
# The with-block closes the file once all lines have been parsed.
with open('file.json', encoding='utf-8') as json_file:
    data = [json.loads(line) for line in json_file if line.strip()]
frame = DataFrame(data)
print(frame)

# Plain-Python tally: collect the 'name' of every record that has one,
# count the occurrences, then pick the top entries.
namelist = [record['name'] for record in data if 'name' in record]
hcount = get_count(namelist)
toplist = best_count(hcount)

# using pandas
name_column = frame['name']
ptoplist = name_column.value_counts()
print(ptoplist[:5])

# cleaning dataset
newlist = name_column.fillna('Missing name')         ## replace missing values
is_blank = newlist == ''
newlist[is_blank] = 'Unknown'                        ## replace empty entries

# This example's own import list does not include numpy, so import it here
# to keep the example runnable on its own.
import numpy as np

# Series: the first whitespace-separated token of every non-null 'key' value
results = Series([s.split()[0] for s in frame.key.dropna()])
print(results[:5])
print(results.value_counts()[:8])

# Keep only rows that have a 'key', and label each row by whether that key
# mentions 'USA'.
newframe = frame[frame.key.notnull()]
location = np.where(newframe['key'].str.contains('USA'), 'In USA', 'Out of USA')

# Count rows per (name, location) pair; unstack turns the location labels
# into columns, and combinations that never occur become 0.
by_location = newframe.groupby(['name', location])
groupcnt = by_location.size().unstack().fillna(0)
print(groupcnt[:5])

# argsort gives the row positions that order the names by ascending total count
sortlist = groupcnt.sum(axis=1).argsort()
print(sortlist[:5])

# The last 12 rows after reordering are the 12 names with the largest totals
sublist = groupcnt.take(sortlist)[-12:]
sublist.plot(kind='barh', stacked=True)
# Divide each row by its own total so every bar shows proportions summing to 1
normlist = sublist.div(sublist.sum(axis=1), axis=0)
normlist.plot(kind='barh', stacked=True)

Example : view data from CSV file

import pandas
import webbrowser
import os

# "data_id" is the column header in the file
data_table = pandas.read_csv("data_set.csv", index_col="data_id")

# Render only the first 100 rows as an HTML table and save it to disk.
first_rows = data_table[0:100]
html = first_rows.to_html()
with open("data.html", "w") as f:
    f.write(html)

# Open the saved page in the web browser through a file:// URL.
full_filename = os.path.abspath("data.html")
page_url = "file://{}".format(full_filename)
webbrowser.open(page_url)

Example : Using Pandas with NumPy

import pandas as pd
import numpy as np
import webbrowser
import os

# data_set.csv is expected to contain "user_id" and "data_id" columns
data_table = pd.read_csv("data_set.csv")
# One row per user_id and one column per data_id (for each remaining value
# column); duplicate (user_id, data_id) pairs collapse to their maximum.
# The string 'max' selects pandas' built-in aggregation; passing np.max is
# deprecated in current pandas and emits a FutureWarning.
result = pd.pivot_table(data_table, index='user_id', columns='data_id', aggfunc='max')

# write to a csv file; missing combinations are written as empty cells
result.to_csv("data.csv", na_rep="")

# or generate a html (first 100 rows only) and write out
html = result[0:100].to_html(na_rep="")
with open("data.html", "w") as f:
    f.write(html)

# Open the saved page in the web browser through a file:// URL.
full_filename = os.path.abspath("data.html")
webbrowser.open("file://{}".format(full_filename))

Example : Create a Recommendation matrix

import numpy as np
import pandas as pd
import matrix_factorization_utilities

raw_data = pd.read_csv('data.csv')
# Build the rating matrix: one row per user_id, one column per movie_id.
# User/movie pairs with no review are left as NaN.
# The string 'max' replaces np.max, which is deprecated as an aggfunc in
# current pandas.
ratings = pd.pivot_table(
          raw_data, index='user_id', columns='movie_id', aggfunc='max')

# Apply matrix factorization to find the latent features.
# DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() returns the
# same underlying 2-D array.
x, y = matrix_factorization_utilities.low_rank_matrix_factorization(
                                               ratings.to_numpy(),
                                               num_features=15,
                                               regularization_amount=0.1)

# Calculate ratings by multiplying the x by y
predicted_ratings = np.matmul(x, y)
# Re-attach the user and movie labels so the saved file is readable.
df = pd.DataFrame(index=ratings.index,
                  columns=ratings.columns,
                  data=predicted_ratings)
df.to_csv("predicted_ratings.csv")

Ann's Code Guidebook

Pages

Wednesday, February 20, 2019

Python Pandas Library

No comments:

Post a Comment

Disclaimer