from pandas import Series, DataFrame
d = {'name':Series(['Braud','Cummings','Heikkinen','Allen'], index = ['a', 'b', 'c', 'd']),\
    'age': Series([22, 38, 26, 35], index = ['a', 'b', 'c', 'd']),\
    'fare': Series([7.25, 71.83, 8.05], index = ['a', 'b', 'd']),\
    'survived?': Series([False, True, True, False], index = ['a', 'b', 'c', 'd'])}

df = DataFrame(d)

df['name']
Out[12]: 
a        Braud
b     Cummings
c    Heikkinen
d        Allen
Name: name, dtype: object

df.loc['a']
Out[14]: 
age             22
fare          7.25
name         Braud
survived?    False
Name: a, dtype: object

df[df['age'] >= 30]
Out[15]: 
   age   fare      name  survived?
b   38  71.83  Cummings       True
d   35   8.05     Allen      False

d2 = {'one': Series([1,2,3], index = ['a', 'b', 'c']),\
    'two': Series([1,2,3,4], index = ['a', 'b', 'c', 'd'])}
df2 = DataFrame(d2)
df2.apply(numpy.mean)

Out[17]: 
one    2.0
two    2.5
dtype: float64

df2['one'].map(lambda x: x>1)
Out[18]: 
a    False
b     True
c     True
d    False
Name: one, dtype: bool

df2.applymap(lambda x: x>1)
Out[19]: 
     one    two
a  False  False
b   True   True
c   True   True
d  False   True

Difference between Python and numpy(Numerical Python)

python2 使用 & |
python3 使用and or

import pandas as pd
countries = [

    'Afghanistan', 'Albania', 'Algeria', 'Angola', 'Argentina',

    'Armenia', 'Australia', 'Austria', 'Azerbaijan', 'Bahamas',

    'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium',

    'Belize', 'Benin', 'Bhutan', 'Bolivia',

    'Bosnia and Herzegovina'

]

employment_values = [

    55.70000076,  51.40000153,  50.5       ,  75.69999695,

    58.40000153,  40.09999847,  61.5       ,  57.09999847,

    60.90000153,  66.59999847,  60.40000153,  68.09999847,

    66.90000153,  53.40000153,  48.59999847,  56.79999924,

    71.59999847,  58.40000153,  70.40000153,  41.20000076

]

# Employment data in 2007 for 20 countries

employment = pd.Series(employment_values, index=countries)

def max_employment(employment):

    '''

    Fill in this function to return the name of the country

    with the highest employment in the given employment

    data, and the employment in that country.

    

    The input will be a Pandas series where the values

    are employment and the index is country names.

    

    Try using the Pandas argmax() function. Documention is

    here: http://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.argmax.html

    '''

    max_country_index = employment.argmax(axis = employment_values)

    max_country = max_country_index      # Replace this with your code

    max_value = employment[max_country_index]   # Replace this with your code

    print max_country, max_value

max_employment(employment)

Angola 75.69999695

pandas dataframe exercise

# Subway ridership for 5 stations on 10 different days
ridership_df = pd.DataFrame(
    data=[[   0,    0,    2,    5,    0],
          [1478, 3877, 3674, 2328, 2539],
          [1613, 4088, 3991, 6461, 2691],
          [1560, 3392, 3826, 4787, 2613],
          [1608, 4802, 3932, 4477, 2705],
          [1576, 3933, 3909, 4979, 2685],
          [  95,  229,  255,  496,  201],
          [   2,    0,    1,   27,    0],
          [1438, 3785, 3589, 4174, 2215],
          [1342, 4043, 4009, 4665, 3033]],
    index=['05-01-11', '05-02-11', '05-03-11', '05-04-11', '05-05-11',
           '05-06-11', '05-07-11', '05-08-11', '05-09-11', '05-10-11'],
    columns=['R003', 'R004', 'R005', 'R006', 'R007']
)

# Change False to True for each block of code to see what it does

# DataFrame creation
if False:
    # You can create a DataFrame out of a dictionary mapping column names to values
    df_1 = pd.DataFrame({'A': [0, 1, 2], 'B': [3, 4, 5]})
    print df_1

    # You can also use a list of lists or a 2D NumPy array
    df_2 = pd.DataFrame([[0, 1, 2], [3, 4, 5]], columns=['A', 'B', 'C'])
    print df_2
   

# Accessing elements
if False:
    print ridership_df.iloc[0]
    print ridership_df.loc['05-05-11']
    print ridership_df['R003']
    print ridership_df.iloc[1, 3]
    
# Accessing multiple rows
if False:
    print ridership_df.iloc[1:4]
    
# Accessing multiple columns
if False:
    print ridership_df[['R003', 'R005']]
    
# Pandas axis
if False:
    df = pd.DataFrame({'A': [0, 1, 2], 'B': [3, 4, 5]})
    print df.sum()
    print df.sum(axis=1)
    print df.values.sum()
    
def mean_riders_for_max_station(ridership):
    '''
    Fill in this function to find the station with the maximum riders on the
    first day, then return the mean riders per day for that station. Also
    return the mean ridership overall for comparsion.
    
    This is the same as a previous exercise, but this time the
    input is a Pandas DataFrame rather than a 2D NumPy array.
    '''
    overall_mean = None # Replace this with your code
    mean_for_max = None # Replace this with your code
    
    return (overall_mean, mean_for_max)

Summing up, apply works on a row / column basis of a DataFrame, applymap works element-wise on a DataFrame, and map works element-wise on a Series.

panda udacity notes

Difference between Python and numpy(Numerical Python)

pandas dataframe exercise

你可能感兴趣的:(panda udacity notes)