Wednesday, February 20, 2019

Python Pandas Library

What is Pandas
  • Pandas : Panel Data
  • Structure data as a virtual spreadsheet.
  • Visualize data to find issues such as sparse datasets or missing values.
  • Pandas.pydata.org
  • Pandas Docs
  • Pandas Quick Overview
  • NumPy.org
  • SciPy.org
  • See PyPI for 3rd party libraries
  • >>> import pandas as pd
    >>> from pandas import DataFrame, Series
    >>> help (pd)
    
    Applications
  • For a recommendation system, create a rating matrix and predict missing ratings for each user
           based on existing reviews.
  • Classification
  • Value estimation
  • Basics
    Example : First Look at pandas
    import numpy as np
    import pandas as pd
    # A Series is a labelled 1-D array; np.nan marks the missing entry
    # (and makes the Series float64).
    mysr = pd.Series([2, 4, 6, np.nan, 7, 9])
    mydt = pd.date_range('20200202', periods=6)       ## default frequency is daily
    mynp = np.arange(24).reshape((6, 4))              ## 6 rows x 4 columns, values 0..23
    mydf = pd.DataFrame(mynp, index=mydt, columns=list('PQST'))
    print(mydf.head())                                ## head() defaults to the first 5 rows
    print(mydf.tail(10))                              ## tail() defaults to the last 5 rows;
                                                      ## 10 > 6 rows, so the whole frame prints
    print(mydf.values)
    print(mydf.index)
    print(mydf.columns)

    # pandas option docs
    # Set the number of decimal places shown when printing floats.
    pd.set_option('display.precision', 2)
    print(mydf.describe())                            ## a quick statistical summary,
                                                      ## int will be viewed as float
    
    # create data frame from python dictionary
    # Scalar entries ('float', 'time', 'misc') are broadcast to the length of
    # the array-like entries, which all have 4 elements here.
    dcdf = pd.DataFrame({
        'float': 1.,
        'time': pd.Timestamp('20200808'),
        'series': pd.Series(1, index=list(range(4)), dtype='float32'),
        'array': np.array([3] * 4, dtype='int32'),
        'categories': pd.Categorical(['t1', 't2', 't3', 't4']),
        'misc': 'useless data',
    })
    print(dcdf.dtypes)
    print(dcdf.T)
    # sort_index / sort_values return new frames rather than sorting in place,
    # so print the results to see them.
    print(dcdf.sort_index(axis=1, ascending=False))    ## sort along the axis, default axis=0
    # 'by' must name a column that exists in this frame.
    print(dcdf.sort_values(by='categories', ascending=False))
    
    Example : get json data and plot density charts
    from pandas import DataFrame, Series
    import pandas as pd
    import json
    from  collections import defaultdict
    
    def get_count(l=None):
        """Count how many times each item occurs in ``l``.

        Args:
            l: Iterable of hashable items. ``None`` (the default) is treated
                as an empty input.

        Returns:
            A ``defaultdict(int)`` mapping each distinct item to its count.
        """
        res = defaultdict(int)
        # None replaces the former mutable ``[]`` default.
        if l is None:
            return res
        for i in l:
            res[i] += 1
        return res
    
    def best_count(h=None, n=5):
        """Return the ``n`` entries of ``h`` with the highest counts.

        Args:
            h: Mapping of item -> count. ``None`` (the default) is treated as
                an empty mapping.
            n: Maximum number of entries to return.

        Returns:
            A list of ``(count, item)`` tuples in ascending order of count, so
            the most frequent item is last. Empty when ``h`` is empty or
            ``n <= 0``.
        """
        # Guard n <= 0 explicitly: pairs[-0:] would return the whole list.
        if not h or n <= 0:
            return []
        # Sort by count only; the sort is stable, so ties keep insertion order
        # and the items themselves never need to be comparable.
        pairs = sorted(((v, k) for k, v in h.items()), key=lambda pair: pair[0])
        return pairs[-n:]
    
    # file.json is read one line at a time; each non-blank line is parsed as
    # its own JSON document (presumably a dict with many fields).
    # The context manager closes the file once every line has been parsed.
    with open('file.json', encoding='utf-8') as fh:
        data = [json.loads(line) for line in fh if line.strip()]
    frame = DataFrame(data)
    print(frame)
    
    # Pure-Python route: collect the 'name' of every record that has one,
    # then tally the names and keep the top entries.
    namelist = []
    for record in data:
        if 'name' in record:
            namelist.append(record['name'])
    hcount = get_count(namelist)
    toplist = best_count(hcount)

    # using pandas
    # value_counts() tallies the column and orders it by descending count.
    name_column = frame['name']
    ptoplist = name_column.value_counts()
    print(ptoplist[:5])

    # cleaning dataset
    # fillna() returns a new Series, so the assignment below does not touch
    # the original frame.
    newlist = name_column.fillna('Missing name')         ## replace missing values
    is_empty = newlist == ''
    newlist[is_empty] = 'Unknown'                        ## replace empty entries
    
    # Series
    # np.where is used below, so numpy must be in scope for this example.
    import numpy as np

    # First whitespace-separated token of every non-missing 'key' value.
    results = Series([s.split()[0] for s in frame.key.dropna()])
    print(results[:5])
    print(results.value_counts()[:8])

    # Keep only rows with a 'key', then label each row by whether that key
    # mentions 'USA'.
    newframe = frame[frame.key.notnull()]
    location = np.where(newframe['key'].str.contains('USA'), 'In USA', 'Out of USA')

    # Count rows per (name, location) pair, pivot location into columns and
    # fill the combinations that never occur with 0.
    by_location = newframe.groupby(['name', location])
    groupcnt = by_location.size().unstack().fillna(0)
    print(groupcnt[:5])

    # argsort() gives the row positions that order names by total count,
    # ascending.
    sortlist = groupcnt.sum(axis=1).argsort()
    print(sortlist[:5])

    # Reorder by total and keep the 12 largest; plot raw counts, then the
    # same rows normalised so each bar sums to 1.
    # NOTE: .plot() needs matplotlib installed.
    sublist = groupcnt.take(sortlist)[-12:]
    sublist.plot(kind='barh', stacked=True)
    normlist = sublist.div(sublist.sum(axis=1), axis=0)
    normlist.plot(kind='barh', stacked=True)
    
    
    Example : view data from CSV file
    import pandas
    import webbrowser
    import os
    
    # "data_id" is the column header in the file
    # Resolve the output location up front so the same absolute path is used
    # for writing the page and for opening it.
    full_filename = os.path.abspath("data.html")

    data_table = pandas.read_csv("data_set.csv", index_col="data_id")

    # Render only the first 100 rows as an HTML table.
    first_rows = data_table[0:100]
    html = first_rows.to_html()
    with open(full_filename, "w") as page:
        page.write(html)

    # Show the generated page in the default web browser.
    webbrowser.open("file://" + full_filename)
    
    Example : Using Pandas with NumPy
    import pandas as pd
    import numpy as np
    import webbrowser
    import os
    
    # Pivot the table: one row per "user_id", one column per "data_id".
    # Duplicate (user_id, data_id) pairs are collapsed to their maximum.
    # The string "max" selects pandas' built-in aggregation; passing np.max
    # is deprecated in current pandas releases.
    data_table = pd.read_csv("data_set.csv")
    result = pd.pivot_table(data_table, index='user_id', columns='data_id', aggfunc="max")

    # write to a csv file (missing cells are written as empty strings)
    result.to_csv("data.csv", na_rep="")

    # or generate a html and write out
    html = result[0:100].to_html(na_rep="")
    with open("data.html", "w") as f:
        f.write(html)

    full_filename = os.path.abspath("data.html")
    webbrowser.open("file://{}".format(full_filename))
    
    Example : Create a Recommendation matrix
    import numpy as np
    import pandas as pd
    import matrix_factorization_utilities
    
    # Build the user x movie ratings matrix. Cells with no review are NaN.
    # Duplicate (user_id, movie_id) pairs are collapsed to their maximum;
    # the string "max" replaces the deprecated np.max callable.
    raw_data = pd.read_csv('data.csv')
    ratings = pd.pivot_table(
              raw_data, index='user_id', columns='movie_id', aggfunc="max")

    # Apply matrix factorization to find the latent features
    # DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() is the
    # replacement and returns the same NumPy array of values.
    # NOTE(review): matrix_factorization_utilities is a project-local helper;
    # its argument semantics are not visible here — confirm against its source.
    x, y = matrix_factorization_utilities.low_rank_matrix_factorization(
                                                   ratings.to_numpy(),
                                                   num_features=15,
                                                   regularization_amount=0.1)

    # Calculate ratings by multiplying the x by y
    predicted_ratings = np.matmul(x, y)
    # Reattach the original user/movie labels to the predicted values.
    df = pd.DataFrame(index=ratings.index,
                      columns=ratings.columns,
                      data=predicted_ratings)
    df.to_csv("predicted_ratings.csv")
    

    No comments:

    Post a Comment