import pandas as pd
from collections import OrderedDict

# Load the scientists table and take its "Age" column as a Series.
scientists = pd.read_csv("./data/scientists.csv")
ages = scientists["Age"]

# Report the largest age, then the average age.
oldest = ages.max()
print(f"max: {oldest}")
average = ages.mean()
print(f"mean: {average}")

max: 90
mean: 59.125

boolean 추출¶

평균나이보다 나이가 많은 사람

모든 데이터에 대해 한 번에 연산하는 것을 Broadcasting이라 함

# Boolean mask: True for every age greater than the mean age.
older_than_mean = ages > ages.mean()
ages[older_than_mean]

1    61
2    90
3    66
7    77
Name: Age, dtype: int64

broadcasting¶

# A two-element Series carrying the default integer index.
pair = pd.Series([1, 100])
print(pair)

print("\n=================================\n")
# Addition matches values by index label; labels that exist only in
# `ages` have no partner and come out as NaN in the printed result.
print(ages + pair)

0      1
1    100
dtype: int64

=================================

0     38.0
1    161.0
2      NaN
3      NaN
4      NaN
5      NaN
6      NaN
7      NaN
dtype: float64

# Re-order the Series by index label, from the largest label to the smallest.
rev_age = ages.sort_index(ascending=False)
print(rev_age) # index in reverse order

7    77
6    41
5    45
4    56
3    66
2    90
1    61
0    37
Name: Age, dtype: int64

# Multiplying by a scalar applies the operation to every element.
doubled = ages * 2
print(doubled)
print("\n=================================\n")

# Series + Series pairs values by index label, so the row order of the
# right-hand operand does not affect the result.
reversed_ages = ages.sort_index(ascending=False)
print(ages + reversed_ages)

0     74
1    122
2    180
3    132
4    112
5     90
6     82
7    154
Name: Age, dtype: int64

=================================

0     74
1    122
2    180
3    132
4    112
5     90
6     82
7    154
Name: Age, dtype: int64

ages * 2 와 ages + ages.sort_index(ascending=False)의 결과값은 동일하다. 벡터와 벡터의 연산은 순서와 상관없이 일치하는 인덱스끼리 수행되기 때문이다.

# Import from the public IPython.display module; importing `display` from
# IPython.core.display is deprecated in newer IPython releases.
from IPython.display import HTML, display

# Inject CSS that stretches the notebook container to the full page width.
display(HTML("<style> .container{width:100% !important;}</style>"))

import pandas as pd
from collections import OrderedDict

# Column name -> values, listed in the order the columns should appear.
scientist_columns = OrderedDict((
    ("Occupation", ["Chemist", "Statistician"]),
    ("Born", ["1920-07-25", "1876-06-13"]),
    ("Died", ["1958-04-16", "1937-10-16"]),
    ("Age", [37, 61]),
))
scientist_names = ["Rosaline Franklin", "William Gosset"]
scientists = pd.DataFrame(scientist_columns, index=scientist_names)

# Label-based row lookup. Despite its name, `first_row` holds the row for
# "William Gosset", which is the second label in the index.
first_row = scientists.loc["William Gosset"]

# Even though Age was given as integers, the printed row Series reports
# dtype object.
print(first_row)

Occupation    Statistician
Born            1876-06-13
Died            1937-10-16
Age                     61
Name: William Gosset, dtype: object

index, values, keys¶

# Show the row labels, the underlying values, and the result of keys(),
# which lists the same labels as .index.
print(f"index: \t{first_row.index}")
print(f"values: \t{first_row.values}")
print(f"keys: {first_row.keys()}")

index: 	Index(['Occupation', 'Born', 'Died', 'Age'], dtype='object')
values: 	['Statistician' '1876-06-13' '1937-10-16' 61]
keys: Index(['Occupation', 'Born', 'Died', 'Age'], dtype='object')

# Using the index attribute:
# take the first label out of the index.
first_label = first_row.index[0]
print(first_label)

Occupation

# Using the keys() method:
# take the first label out of the object keys() returns.
first_key = first_row.keys()[0]
print(first_key)

Occupation

statistic¶

# Select the "Age" column of the DataFrame; the printed result is a Series
# labelled with the DataFrame's row index.
ages = scientists["Age"]
print(ages)

Rosaline Franklin    37
William Gosset       61
Name: Age, dtype: int64

# Print basic descriptive statistics of the Age Series, one per line.
# Each statistic is computed right before it is printed.
for stat_name in ("mean", "min", "max", "std"):
    print(f"{stat_name}: {getattr(ages, stat_name)()}")

mean: 49.0
min: 37
max: 61
std: 16.97056274847714

series method¶

series	describe
append	2개 이상의 시리즈 연결 (pandas 2.0에서 제거됨 — pd.concat 사용)
describe	요약 통계량 계산
drop_duplicates	중복값이 없는 시리즈
equals	두 시리즈의 요소가 서로 동일한지 확인
get_values	시리즈 값 구하기(values 속성과 동일, pandas 1.0에서 제거됨 — to_numpy() 사용)
isin	시리즈에 포함된 값이 있는지 확인
min	최소값
max	최댓값
mean	평균
median	중간값
replace	특정 값을 가진 시리즈 값을 교체
sample	임의의 값을 반환
sort_values	값을 정렬
to_frame	시리즈를 데이터프레임으로 변환

# Import from the public IPython.display module; importing `display` from
# IPython.core.display is deprecated in newer IPython releases.
from IPython.display import HTML, display

# Inject CSS that stretches the notebook container to the full page width.
display(HTML("<style> .container{width:100% !important;}</style>"))

import pandas as pd

# A Series built from a str and an int; the printed dtype is object.
mixed_values = ["bonggu", 31]
s = pd.Series(data=mixed_values)
print(s)

0    bonggu
1        31
dtype: object

Series 생성¶

# Without an `index` argument the Series gets the default integer labels.
# This value is rebound by the next statement before it is ever printed.
s = pd.Series(["bong_gu", "chatterBOX"])

# Same construction, this time with explicit string labels.
series_labels = ["person", "who"]
s = pd.Series(["bong_gu", "cahtterBOX"], index=series_labels)
print(s)

person       bong_gu
who       cahtterBOX
dtype: object

DataFrame 생성¶

dictionary

# Column data keyed by column name.
scientist_data = {
    "Name": ["Bong_gu", "Yang_o"],
    "Job": ["buyer", "Data_analysis"],
    "Born": ["SEOUL", "DONGHAE"],
    "Age": [31, 31],
    "Hate": ["JaeHyeop", "JAeHyeop"],
}

# `columns` fixes the column order explicitly (Hate comes before Age);
# `index` supplies the row labels.
scientists = pd.DataFrame(
    scientist_data,
    index=["Bong_gu", "Yang_o"],
    columns=["Name", "Job", "Born", "Hate", "Age"],
)

# Bare expression: the notebook renders the DataFrame as the cell output.
scientists

## DataFrame에서 순서보장
from collections import OrderedDict

# Insert the columns one at a time so the intended column order is explicit.
ordered_columns = OrderedDict()
ordered_columns["name"] = ["Rosaline Franklin", "William Gosset"]
ordered_columns["Occupation"] = ["Chemist", "Statistician"]
ordered_columns["Born"] = ["1920-07-25", "1876-06-13"]
ordered_columns["Died"] = ["1958-04-16", "1937-10-16"]
ordered_columns["Age"] = [37, 61]

# No `index` argument: the rows get the default integer labels.
scientists = pd.DataFrame(ordered_columns)

print(scientists)

                name    Occupation        Born        Died  Age
0  Rosaline Franklin       Chemist  1920-07-25  1958-04-16   37
1     William Gosset  Statistician  1876-06-13  1937-10-16   61

# Import from the public IPython.display module; importing `display` from
# IPython.core.display is deprecated in newer IPython releases.
from IPython.display import display, HTML

# Inject CSS that stretches the notebook container to the full page width.
display(HTML("<style> .container{width:100% !important;}</style>"))

import pandas as pd

# Load the tab-separated gapminder data set.
gapminder_path = "./data/gapminder.tsv"
df = pd.read_csv(gapminder_path, sep="\t")

# Show the first ten rows.
print(df.head(10))

       country continent  year  lifeExp       pop   gdpPercap
0  Afghanistan      Asia  1952   28.801   8425333  779.445314
1  Afghanistan      Asia  1957   30.332   9240934  820.853030
2  Afghanistan      Asia  1962   31.997  10267083  853.100710
3  Afghanistan      Asia  1967   34.020  11537966  836.197138
4  Afghanistan      Asia  1972   36.088  13079460  739.981106
5  Afghanistan      Asia  1977   38.438  14880372  786.113360
6  Afghanistan      Asia  1982   39.854  12881816  978.011439
7  Afghanistan      Asia  1987   40.822  13867957  852.395945
8  Afghanistan      Asia  1992   41.674  16317921  649.341395
9  Afghanistan      Asia  1997   41.763  22227415  635.341351

groupby¶

# Mean life expectancy for each year.
mean_life_by_year = df.groupby("year")["lifeExp"].mean()
print(mean_life_by_year)
print("\n======================================\n")

# Number of distinct countries on each continent.
countries_per_continent = df.groupby("continent")["country"].nunique()
print(countries_per_continent)
print("\n======================================\n")

year
1952    49.057620
1957    51.507401
1962    53.609249
1967    55.678290
1972    57.647386
1977    59.570157
1982    61.533197
1987    63.212613
1992    64.160338
1997    65.014676
2002    65.694923
2007    67.007423
Name: lifeExp, dtype: float64

======================================

continent
Africa      52
Americas    25
Asia        33
Europe      30
Oceania      2
Name: country, dtype: int64

======================================

# Group on two keys at once, then average life expectancy within each
# (year, continent) pair. The bare expression is the cell's displayed output.
by_year_and_continent = df.groupby(["year", "continent"])
by_year_and_continent["lifeExp"].mean()

year  continent
1952  Africa       39.135500
      Americas     53.279840
      Asia         46.314394
      Europe       64.408500
      Oceania      69.255000
1957  Africa       41.266346
      Americas     55.960280
      Asia         49.318544
      Europe       66.703067
      Oceania      70.295000
1962  Africa       43.319442
      Americas     58.398760
      Asia         51.563223
      Europe       68.539233
      Oceania      71.085000
1967  Africa       45.334538
      Americas     60.410920
      Asia         54.663640
      Europe       69.737600
      Oceania      71.310000
1972  Africa       47.450942
      Americas     62.394920
      Asia         57.319269
      Europe       70.775033
      Oceania      71.910000
1977  Africa       49.580423
      Americas     64.391560
      Asia         59.610556
      Europe       71.937767
      Oceania      72.855000
1982  Africa       51.592865
      Americas     66.228840
      Asia         62.617939
      Europe       72.806400
      Oceania      74.290000
1987  Africa       53.344788
      Americas     68.090720
      Asia         64.851182
      Europe       73.642167
      Oceania      75.320000
1992  Africa       53.629577
      Americas     69.568360
      Asia         66.537212
      Europe       74.440100
      Oceania      76.945000
1997  Africa       53.598269
      Americas     71.150480
      Asia         68.020515
      Europe       75.505167
      Oceania      78.190000
2002  Africa       53.325231
      Americas     72.422040
      Asia         69.233879
      Europe       76.700600
      Oceania      79.740000
2007  Africa       54.806038
      Americas     73.608120
      Asia         70.728485
      Europe       77.648600
      Oceania      80.719500
Name: lifeExp, dtype: float64

graph¶

# IPython magic (not plain Python): selects the inline matplotlib backend so
# figures are rendered inside the notebook.
%matplotlib inline
import matplotlib.pyplot as plt

# Default figure size, given as (width, height).
plt.rcParams["figure.figsize"] = (12, 9)
# Disable the Unicode minus sign on tick labels.
# NOTE(review): presumably set to avoid missing-glyph boxes with a non-Latin font — confirm.
plt.rcParams['axes.unicode_minus'] = False

# Mean life expectancy per year, drawn with the Series' default plot.
lifeExp = df.groupby("year")["lifeExp"].mean()
lifeExp.plot()
plt.show()

# Import from the public IPython.display module; importing `display` from
# IPython.core.display is deprecated in newer IPython releases.
from IPython.display import display, HTML

# Inject CSS that stretches the notebook container to the full page width.
display(HTML("<style> .container{width:100% !important;}</style>"))

import pandas as pd

# Load the tab-separated gapminder data set into a DataFrame.
df = pd.read_csv("./data/gapminder.tsv", sep="\t")

데이터 확인하기¶

# Quick inspection of the DataFrame: leading rows, type, shape, columns, dtypes.
print(df.head()) # first rows
print("\n{}".format(type(df)))
print("\n{}".format(df.shape)) # (rows, columns)
print("\n{}".format(df.columns)) # column labels
print("\n{}\n".format(df.dtypes)) # dtype of each column
# NOTE: df.info() prints its report itself and returns None, so wrapping it in
# print()/format() also emits a trailing "None" line; calling df.info() on its
# own would avoid that.
print("{}".format(df.info())) # info

       country continent  year  lifeExp       pop   gdpPercap
0  Afghanistan      Asia  1952   28.801   8425333  779.445314
1  Afghanistan      Asia  1957   30.332   9240934  820.853030
2  Afghanistan      Asia  1962   31.997  10267083  853.100710
3  Afghanistan      Asia  1967   34.020  11537966  836.197138
4  Afghanistan      Asia  1972   36.088  13079460  739.981106

<class 'pandas.core.frame.DataFrame'>

(1704, 6)

Index(['country', 'continent', 'year', 'lifeExp', 'pop', 'gdpPercap'], dtype='object')

country       object
continent     object
year           int64
lifeExp      float64
pop            int64
gdpPercap    float64
dtype: object

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1704 entries, 0 to 1703
Data columns (total 6 columns):
country      1704 non-null object
continent    1704 non-null object
year         1704 non-null int64
lifeExp      1704 non-null float64
pop          1704 non-null int64
gdpPercap    1704 non-null float64
dtypes: float64(2), int64(2), object(2)
memory usage: 80.0+ KB
None

데이터 추출¶

## Extract a single column
# Selecting one column by name yields a Series (see the printed type), even
# though the variable is called `country_df`.
country_df = df["country"]
print(type(country_df))

# First rows, a separator, then the last rows.
leading_countries = country_df.head()
print(leading_countries)
print("\n==============================\n")
trailing_countries = country_df.tail()
print(trailing_countries)

<class 'pandas.core.series.Series'>
0    Afghanistan
1    Afghanistan
2    Afghanistan
3    Afghanistan
4    Afghanistan
Name: country, dtype: object

==============================

1699    Zimbabwe
1700    Zimbabwe
1701    Zimbabwe
1702    Zimbabwe
1703    Zimbabwe
Name: country, dtype: object

columns 3개 추출 "country", "continent", "year"¶

type
head
tail

# Passing a list of column names selects several columns at once.
wanted_columns = ["country", "continent", "year"]
subset = df[wanted_columns]

print(subset.dtypes)  ## dtype of each selected column
print("\n==============================\n")
leading_rows = subset.head()
print(leading_rows)  ## first rows
print("\n==============================\n")
trailing_rows = subset.tail()
print(trailing_rows)  ## last rows

country      object
continent    object
year          int64
dtype: object

==============================

       country continent  year
0  Afghanistan      Asia  1952
1  Afghanistan      Asia  1957
2  Afghanistan      Asia  1962
3  Afghanistan      Asia  1967
4  Afghanistan      Asia  1972

==============================

       country continent  year
1699  Zimbabwe    Africa  1987
1700  Zimbabwe    Africa  1992
1701  Zimbabwe    Africa  1997
1702  Zimbabwe    Africa  2002
1703  Zimbabwe    Africa  2007

loc 속성으로 데이터 추출¶

인덱스가 음이 아닌 정수 사용
인덱스 vs 행번호

## Extract the rows whose index labels are 0 and 99

row_labelled_0 = df.loc[0]
print(row_labelled_0)
print("\n==============================\n")
row_labelled_99 = df.loc[99]
print(row_labelled_99)

country      Afghanistan
continent           Asia
year                1952
lifeExp           28.801
pop              8425333
gdpPercap        779.445
Name: 0, dtype: object

==============================

country      Bangladesh
continent          Asia
year               1967
lifeExp          43.453
pop            62821884
gdpPercap       721.186
Name: 99, dtype: object

## Extract the last row using the row count (shape is an attribute, not a method)
### The row count is the actual size of the frame, but the integer index
### starts at 0, so the label of the last row is the row count minus 1.

row_count = len(df.index)
last_row = row_count - 1
print(df.loc[last_row])

print("\n==============================\n")
## Extract the last row using tail()
### by passing the n argument
final_rows = df.tail(n=1)
print(final_rows)

country      Zimbabwe
continent      Africa
year             2007
lifeExp        43.487
pop          12311143
gdpPercap     469.709
Name: 1703, dtype: object

==============================

       country continent  year  lifeExp       pop   gdpPercap
1703  Zimbabwe    Africa  2007   43.487  12311143  469.709298

### Extract the rows whose index labels are 0, 99 and 999
### by passing a list of labels

wanted_labels = [0, 99, 999]
print(df.loc[wanted_labels])

         country continent  year  lifeExp       pop    gdpPercap
0    Afghanistan      Asia  1952   28.801   8425333   779.445314
99    Bangladesh      Asia  1967   43.453  62821884   721.186086
999     Mongolia      Asia  1967   51.253   1149500  1226.041130

iloc 속성으로 데이터 추출¶

loc는 데이터프레임의 인덱스를 사용하여 데이터를 추출
iloc는 행번호를 사용하여 데이터를 추출

# iloc selects by integer row position: position 1 is the second row.
row_at_1 = df.iloc[1]
print(row_at_1)
print("\n==============================\n")
# Position 99 is the hundredth row.
row_at_99 = df.iloc[99]
print(row_at_99)

country      Afghanistan
continent           Asia
year                1957
lifeExp           30.332
pop              9240934
gdpPercap        820.853
Name: 1, dtype: object

==============================

country      Bangladesh
continent          Asia
year               1967
lifeExp          43.453
pop            62821884
gdpPercap       721.186
Name: 99, dtype: object

### iloc with -1 extracts the last row
print(df.iloc[-1])
print(df.iloc[1710]) # deliberate: position 1710 is past the end of the 1704-row frame, so this raises IndexError

country      Zimbabwe
continent      Africa
year             2007
lifeExp        43.487
pop          12311143
gdpPercap     469.709
Name: 1703, dtype: object


IndexErrorTraceback (most recent call last)
<ipython-input-10-8a802f6f3585> in <module>
      1 ### iloc에 -1은 행 데이터를 추출
      2 print(df.iloc[-1])
----> 3 print(df.iloc[1710])

/anaconda3/lib/python3.6/site-packages/pandas/core/indexing.py in __getitem__(self, key)
   1476 
   1477             maybe_callable = com._apply_if_callable(key, self.obj)
-> 1478             return self._getitem_axis(maybe_callable, axis=axis)
   1479 
   1480     def _is_scalar_access(self, key):

/anaconda3/lib/python3.6/site-packages/pandas/core/indexing.py in _getitem_axis(self, key, axis)
   2100 
   2101             # validate the location
-> 2102             self._validate_integer(key, axis)
   2103 
   2104             return self._get_loc(key, axis=axis)

/anaconda3/lib/python3.6/site-packages/pandas/core/indexing.py in _validate_integer(self, key, axis)
   2007         l = len(ax)
   2008         if key >= l or key < -l:
-> 2009             raise IndexError("single positional indexer is out-of-bounds")
   2010 
   2011     def _getitem_tuple(self, tup):

IndexError: single positional indexer is out-of-bounds

# Positional selection of several rows at once via a list of positions.
row_positions = [0, 99, 999]
print(df.iloc[row_positions])

         country continent  year  lifeExp       pop    gdpPercap
0    Afghanistan      Asia  1952   28.801   8425333   779.445314
99    Bangladesh      Asia  1967   43.453  62821884   721.186086
999     Mongolia      Asia  1967   51.253   1149500  1226.041130

### A bare ":" in the row slot selects every row
# Columns chosen by label.
label_columns = ["year", "pop"]
subset = df.loc[:, label_columns]
print(subset.head())

# Columns chosen by position; -1 refers to the last column.
position_columns = [2, 4, -1]
subset = df.iloc[:, position_columns]
print(subset.head())

   year       pop
0  1952   8425333
1  1957   9240934
2  1962  10267083
3  1967  11537966
4  1972  13079460
   year       pop   gdpPercap
0  1952   8425333  779.445314
1  1957   9240934  820.853030
2  1962  10267083  853.100710
3  1967  11537966  836.197138
4  1972  13079460  739.981106

# Column positions 0 through 4, given as a range object.
small_range = range(0, 5)

subset = df.iloc[:, small_range]
print(subset.head())

print("\n==============================\n")
# Column positions 3 through 5, given as an explicit list.
range2 = [3, 4, 5]
subset = df.iloc[:, range2]
print(subset.head())

       country continent  year  lifeExp       pop
0  Afghanistan      Asia  1952   28.801   8425333
1  Afghanistan      Asia  1957   30.332   9240934
2  Afghanistan      Asia  1962   31.997  10267083
3  Afghanistan      Asia  1967   34.020  11537966
4  Afghanistan      Asia  1972   36.088  13079460

==============================

   lifeExp       pop   gdpPercap
0   28.801   8425333  779.445314
1   30.332   9240934  820.853030
2   31.997  10267083  853.100710
3   34.020  11537966  836.197138
4   36.088  13079460  739.981106

# Every second column position below 6, i.e. 0, 2 and 4, as a list.
range3 = [0, 2, 4]
subset = df.iloc[:, range3]
print(subset.head())
print("\n==============================\n")
# The same selection expressed as a start:stop:step slice.
every_other_column = slice(0, 6, 2)
subset = df.iloc[:, every_other_column]
print(subset.head())

       country  year       pop
0  Afghanistan  1952   8425333
1  Afghanistan  1957   9240934
2  Afghanistan  1962  10267083
3  Afghanistan  1967  11537966
4  Afghanistan  1972  13079460

==============================

       country  year       pop
0  Afghanistan  1952   8425333
1  Afghanistan  1957   9240934
2  Afghanistan  1962  10267083
3  Afghanistan  1967  11537966
4  Afghanistan  1972  13079460

### loc vs. iloc: the same rows and columns selected by position and by label.
### (Author's note: the larger the data set, the more advantageous loc is.)
selected_rows = [0, 99, 999]
print(df.iloc[selected_rows, [0, 3, 5]])
print(df.loc[selected_rows, ["country", "lifeExp", "gdpPercap"]])

         country  lifeExp    gdpPercap
0    Afghanistan   28.801   779.445314
99    Bangladesh   43.453   721.186086
999     Mongolia   51.253  1226.041130
         country  lifeExp    gdpPercap
0    Afghanistan   28.801   779.445314
99    Bangladesh   43.453   721.186086
999     Mongolia   51.253  1226.041130

# Import from the public IPython.display module; importing `display` from
# IPython.core.display is deprecated in newer IPython releases.
from IPython.display import display, HTML

# Inject CSS that stretches the notebook container to the full page width.
display(HTML("<style> .container{width:100% !important;}</style>"))

07.handling_dataframe(bool-apply) (0)	2018.12.09
06.handling_dataframe(bool) (0)	2018.12.09
04.handling_series(basic) (0)	2018.12.09
03.create_data_frame (0)	2018.12.09
02.basic_statistic (0)	2018.12.09

06.handling_dataframe(bool) (0)	2018.12.09
05.handling_series(apply) (0)	2018.12.09
03.create_data_frame (0)	2018.12.09
02.basic_statistic (0)	2018.12.09
01.data_slicing (0)	2018.12.09

06.handling_dataframe(bool) (0)	2018.12.09
05.handling_series(apply) (0)	2018.12.09
04.handling_series(basic) (0)	2018.12.09
02.basic_statistic (0)	2018.12.09
01.data_slicing (0)	2018.12.09

06.handling_dataframe(bool) (0)	2018.12.09
05.handling_series(apply) (0)	2018.12.09
04.handling_series(basic) (0)	2018.12.09
03.create_data_frame (0)	2018.12.09
01.data_slicing (0)	2018.12.09

06.handling_dataframe(bool) (0)	2018.12.09
05.handling_series(apply) (0)	2018.12.09
04.handling_series(basic) (0)	2018.12.09
03.create_data_frame (0)	2018.12.09
02.basic_statistic (0)	2018.12.09

	Name	Job	Born	Hate	Age
Bong_gu	Bong_gu	buyer	SEOUL	JaeHyeop	31
Yang_o	Yang_o	Data_analysis	DONGHAE	JAeHyeop	31

Elastic Net Regression (0)	2018.04.29
LASSO and Ridge Regression (1)	2018.04.27
Deming Regression (0)	2018.04.27
Loss Function in Linear Regressions (0)	2018.04.26
TensorFlow Way of LinearRegression (0)	2018.04.26

Logistic Regression (0)	2018.05.01
LASSO and Ridge Regression (1)	2018.04.27
Deming Regression (0)	2018.04.27
Loss Function in Linear Regressions (0)	2018.04.26
TensorFlow Way of LinearRegression (0)	2018.04.26

parallelize (0)	2018.06.06
lapply (0)	2018.06.06

data_frame, apply (0)	2018.06.07
lapply (0)	2018.06.06

data_frame, apply (0)	2018.06.07
parallelize (0)	2018.06.06

분류 전체보기

boolean 추출¶

broadcasting¶

'pandas > basic' 카테고리의 다른 글

index, values, keys¶

statistic¶

series method¶

'pandas > basic' 카테고리의 다른 글

Series 생성¶

DataFrame 생성¶

'pandas > basic' 카테고리의 다른 글

groupby¶

graph¶

'pandas > basic' 카테고리의 다른 글

데이터 확인하기¶

데이터 추출¶

columns 3개 추출 "country", "continent", "year"¶

loc 속성으로 데이터 추출¶

iloc 속성으로 데이터 추출¶

'pandas > basic' 카테고리의 다른 글

matrix와 data.frame의 조작

matrix와 array

'R > functional' 카테고리의 다른 글

parallelize

'R > functional' 카테고리의 다른 글

1000개의 무작위 균일 난수를 입력으로 제공받은 함수를 호출

data.frame에 lapply 적용하기

for문 사용

lapply로 구현하기

'R > functional' 카테고리의 다른 글

'Tensorflow > Support Vector Machine' 카테고리의 다른 글

'Tensorflow > Support Vector Machine' 카테고리의 다른 글

'Tensorflow > Linear Regression' 카테고리의 다른 글

'Tensorflow > Linear Regression' 카테고리의 다른 글

'Tensorflow > Linear Regression' 카테고리의 다른 글

'Tensorflow > Linear Regression' 카테고리의 다른 글

'Tensorflow > Linear Regression' 카테고리의 다른 글

티스토리툴바

`matrix`와 `data.frame`의 조작