What is Pandas
>>> import pandas as pd
>>> from pandas import DataFrame, Series
>>> help(pd)
Applications
based on existing reviews.
Basics
Example : First Look at pandas
import numpy as np
import pandas as pd
# A Series is a one-dimensional labelled array; np.nan marks a missing value.
mysr = pd.Series([2, 4, 6, np.nan, 7, 9])

# Six consecutive dates starting 2020-02-02 (the default frequency is daily).
mydt = pd.date_range('20200202', periods=6)

# The integers 0..23 arranged as 6 rows x 4 columns.
mynp = np.arange(24).reshape((6, 4))
mydf = pd.DataFrame(mynp, index=mydt, columns=list('PQST'))

print(mydf.head())    # head() shows the first 5 rows by default
print(mydf.tail(10))  # tail() also defaults to 5; 10 exceeds the 6 rows, so all are shown
print(mydf.values)
print(mydf.index)
print(mydf.columns)

# Limit displayed floats to 2 decimal places (see the pandas options docs).
# The option is spelled 'display.precision'; it only needs to be set once,
# before the first frame whose floats should be shortened is printed.
pd.set_option('display.precision', 2)

# A quick statistical summary; the integer columns are reported as floats.
print(mydf.describe())

# Create a data frame from a Python dictionary: every key becomes a column.
# The scalar entries are repeated for each of the 4 rows supplied by the
# Series, array and Categorical entries.
dcdf = pd.DataFrame({
    'float': 1.,
    'time': pd.Timestamp('20200808'),
    'series': pd.Series(1, index=list(range(4)), dtype='float32'),
    'array': np.array([3] * 4, dtype='int32'),
    'categories': pd.Categorical(['t1', 't2', 't3', 't4']),
    'misc': 'useless data',
})
print(dcdf.dtypes)
print(dcdf.T)

# Both sort methods return a new frame instead of sorting in place, so the
# results are printed here rather than silently discarded.
# sort_index sorts along an axis (default axis=0); axis=1 orders the column labels.
print(dcdf.sort_index(axis=1, ascending=False))
# 'Q' is a column of mydf (dcdf has no such column), so sort that frame by it.
print(mydf.sort_values(by='Q', ascending=False))
Example : get json data and plot density charts
import json
from collections import defaultdict

import numpy as np
import pandas as pd
from pandas import DataFrame, Series
def get_count(l=None):
    """Count how many times each item occurs in ``l``.

    Args:
        l: Iterable of hashable items. ``None`` (the default) is treated as
            an empty sequence.

    Returns:
        A ``defaultdict(int)`` mapping each distinct item to the number of
        times it was seen.
    """
    res = defaultdict(int)
    # ``None`` stands in for the former mutable ``[]`` default so that no list
    # object is shared between calls.
    for i in ([] if l is None else l):
        res[i] += 1
    return res
def best_count(h=None, n=5):
    """Return the ``n`` entries with the highest counts from a count mapping.

    Args:
        h: Mapping of key -> count, e.g. the result of ``get_count``.
            ``None`` (the default) is treated as an empty mapping.
        n: Maximum number of entries to return.

    Returns:
        A list of ``(count, key)`` tuples in ascending order of count, so the
        most frequent entry comes last. Empty when ``h`` is empty or ``n`` is
        not positive.
    """
    # Guard n <= 0 explicitly: the slice ``pairs[-0:]`` would otherwise return
    # every entry instead of none.
    if h is None or n <= 0:
        return []
    # Sort by count so the tail of the list really holds the largest counts;
    # without sorting, the slice only reflects the mapping's insertion order.
    # Sorting on the count alone never compares the keys with each other.
    pairs = sorted(((v, k) for k, v in h.items()), key=lambda pair: pair[0])
    return pairs[-n:]
# file.json is parsed one JSON document per line; each record is a dict with
# many fields. The context manager closes the file once every line is read.
with open('file.json', encoding='utf-8') as fh:
    data = [json.loads(line) for line in fh]
frame = DataFrame(data)
print(frame)

# Count the names in plain Python, skipping records without a 'name' field.
namelist = [item['name'] for item in data if 'name' in item]
hcount = get_count(namelist)
toplist = best_count(hcount)

# The same count using pandas.
ptoplist = frame['name'].value_counts()
print(ptoplist[:5])

# Cleaning the dataset.
newlist = frame['name'].fillna('Missing name')  # replace missing values
newlist[newlist == ''] = 'Unknown'              # replace empty entries

# Series: keep the first whitespace-separated token of every non-null 'key'.
results = Series([s.split()[0] for s in frame.key.dropna()])
print(results[:5])
print(results.value_counts()[:8])

# Label each row that has a 'key' value by whether that value mentions 'USA',
# then count the rows per (name, label) pair.
newframe = frame[frame.key.notnull()]
location = np.where(newframe['key'].str.contains('USA'), 'In USA', 'Out of USA')
by_location = newframe.groupby(['name', location])
groupcnt = by_location.size().unstack().fillna(0)
print(groupcnt[:5])

# argsort gives the row positions in ascending order of total count, so the
# last 12 rows after take() are the names with the highest totals.
sortlist = groupcnt.sum(axis=1).argsort()
print(sortlist[:5])
sublist = groupcnt.take(sortlist)[-12:]
sublist.plot(kind='barh', stacked=True)

# Divide each row by its own total so the stacked bars show proportions.
normlist = sublist.div(sublist.sum(axis=1), axis=0)
normlist.plot(kind='barh', stacked=True)
Example : view data from CSV file
import pandas
import webbrowser
import os
# "data_id" is the column header in the file; use that column as the row index.
data_table = pandas.read_csv("data_set.csv", index_col="data_id")

# Render only the first 100 rows as an HTML table.
html = data_table[0:100].to_html()

# An explicit encoding keeps the write independent of the platform default.
with open("data.html", "w", encoding="utf-8") as f:
    f.write(html)

# Hand the absolute path of the generated page to the web browser.
full_filename = os.path.abspath("data.html")
webbrowser.open("file://{}".format(full_filename))
Example : Using Pandas with NumPy
import pandas as pd
import numpy as np
import webbrowser
import os
# The CSV is expected to provide "user_id" and "data_id" columns: they become
# the rows and the columns of the pivot table built below.
data_table = pd.read_csv("data_set.csv")

# aggfunc="max" keeps the largest value when a (user_id, data_id) pair occurs
# more than once. The string form is what recent pandas versions ask for in
# place of passing the NumPy function np.max.
result = pd.pivot_table(data_table, index='user_id', columns='data_id', aggfunc="max")

# Write to a CSV file; missing cells are written as empty strings.
result.to_csv("data.csv", na_rep="")

# Or generate an HTML table of the first 100 rows and write that out.
html = result[0:100].to_html(na_rep="")
with open("data.html", "w", encoding="utf-8") as f:
    f.write(html)

# Hand the absolute path of the generated page to the web browser.
full_filename = os.path.abspath("data.html")
webbrowser.open("file://{}".format(full_filename))
Example : Create a Recommendation matrix
import numpy as np
import pandas as pd
import matrix_factorization_utilities
raw_data = pd.read_csv('data.csv')

# One row per user_id and one column per movie_id. aggfunc="max" keeps the
# largest value when a (user, movie) pair occurs more than once; the string
# form is what recent pandas versions ask for in place of np.max.
ratings = pd.pivot_table(
    raw_data, index='user_id', columns='movie_id', aggfunc="max")

# Apply matrix factorization to find the latent features.
# DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() is its
# replacement and likewise returns the frame's values as a NumPy array.
# NOTE(review): low_rank_matrix_factorization is a project-local helper; it is
# assumed to return two matrices whose product has the shape of ``ratings``
# - confirm against its implementation.
x, y = matrix_factorization_utilities.low_rank_matrix_factorization(
    ratings.to_numpy(),
    num_features=15,
    regularization_amount=0.1)

# Calculate the predicted ratings by multiplying x by y.
predicted_ratings = np.matmul(x, y)

# Re-attach the user and movie labels before saving.
df = pd.DataFrame(index=ratings.index,
                  columns=ratings.columns,
                  data=predicted_ratings)
df.to_csv("predicted_ratings.csv")
No comments:
Post a Comment