import pandas as pd

# Load the Gapminder data set; sep="\t" because the file is tab-separated (.tsv).
df = pd.read_csv("./data/gapminder.tsv", sep="\t")

데이터 확인하기

# Inspect the loaded DataFrame: first rows, Python type, shape,
# column labels, per-column dtypes, and the info() summary.
print(df.head())  # first rows
print(f"\n{type(df)}")  # Python type of the object
print(f"\n{df.shape}")  # (rows, columns)
print(f"\n{df.columns}")  # column labels
print(f"\n{df.dtypes}\n")  # dtype of each column
# df.info() writes its report to stdout itself and returns None, so wrapping
# it in print() emits an extra "None" line after the report. Call it directly.
df.info()

       country continent  year  lifeExp       pop   gdpPercap
0  Afghanistan      Asia  1952   28.801   8425333  779.445314
1  Afghanistan      Asia  1957   30.332   9240934  820.853030
2  Afghanistan      Asia  1962   31.997  10267083  853.100710
3  Afghanistan      Asia  1967   34.020  11537966  836.197138
4  Afghanistan      Asia  1972   36.088  13079460  739.981106

<class 'pandas.core.frame.DataFrame'>

(1704, 6)

Index(['country', 'continent', 'year', 'lifeExp', 'pop', 'gdpPercap'], dtype='object')

country       object
continent     object
year           int64
lifeExp      float64
pop            int64
gdpPercap    float64
dtype: object

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1704 entries, 0 to 1703
Data columns (total 6 columns):
country      1704 non-null object
continent    1704 non-null object
year         1704 non-null int64
lifeExp      1704 non-null float64
pop          1704 non-null int64
gdpPercap    1704 non-null float64
dtypes: float64(2), int64(2), object(2)
memory usage: 80.0+ KB
None

데이터 추출

## Extract a single column: indexing with one column name returns a Series
country_df = df["country"]
divider = "\n==============================\n"

print(type(country_df))
print(country_df.head())
print(divider)
print(country_df.tail())

<class 'pandas.core.series.Series'>
0    Afghanistan
1    Afghanistan
2    Afghanistan
3    Afghanistan
4    Afghanistan
Name: country, dtype: object

==============================

1699    Zimbabwe
1700    Zimbabwe
1701    Zimbabwe
1702    Zimbabwe
1703    Zimbabwe
Name: country, dtype: object

columns 3개 추출 "country", "continent", "year"

type
head
tail

# Select several columns at once by indexing with a list of column names.
wanted_columns = ["country", "continent", "year"]
subset = df[wanted_columns]

divider = "\n==============================\n"
print(subset.dtypes)  # dtype of each selected column
print(divider)
print(subset.head())  # first rows
print(divider)
print(subset.tail())  # last rows

country      object
continent    object
year          int64
dtype: object

==============================

       country continent  year
0  Afghanistan      Asia  1952
1  Afghanistan      Asia  1957
2  Afghanistan      Asia  1962
3  Afghanistan      Asia  1967
4  Afghanistan      Asia  1972

==============================

       country continent  year
1699  Zimbabwe    Africa  1987
1700  Zimbabwe    Africa  1992
1701  Zimbabwe    Africa  1997
1702  Zimbabwe    Africa  2002
1703  Zimbabwe    Africa  2007

loc 속성으로 데이터 추출

loc는 인덱스 라벨로 행을 추출한다. 이 데이터프레임은 기본 RangeIndex를 사용하므로 라벨이 0 이상의 정수이다.
인덱스(라벨)와 행 번호(위치)는 여기서는 값이 같지만 서로 다른 개념이다.

## Extract the rows whose index labels are 0 and 99
## (loc with a single label returns that row as a Series)

print(df.loc[0])
print("\n==============================\n")
print(df.loc[99])

country      Afghanistan
continent           Asia
year                1952
lifeExp           28.801
pop              8425333
gdpPercap        779.445
Name: 0, dtype: object

==============================

country      Bangladesh
continent          Asia
year               1967
lifeExp          43.453
pop            62821884
gdpPercap       721.186
Name: 99, dtype: object

## Extract the last row using the shape attribute (shape is not a method)
### shape[0] is the number of rows, but this frame's index labels
### start at 0, so the last label is one less than the row count

last_row = df.shape[0]-1
print(df.loc[last_row])

print("\n==============================\n")
## Extract the last row using tail()
### pass n=1 to keep only the final row; unlike loc with a single label,
### the result keeps the DataFrame layout
print(df.tail(n=1))

country      Zimbabwe
continent      Africa
year             2007
lifeExp        43.487
pop          12311143
gdpPercap     469.709
Name: 1703, dtype: object

==============================

       country continent  year  lifeExp       pop   gdpPercap
1703  Zimbabwe    Africa  2007   43.487  12311143  469.709298

### Extract the rows whose index labels are 0, 99 and 999
### by passing a list of labels; the result is a DataFrame

print(df.loc[[0, 99, 999]])

         country continent  year  lifeExp       pop    gdpPercap
0    Afghanistan      Asia  1952   28.801   8425333   779.445314
99    Bangladesh      Asia  1967   43.453  62821884   721.186086
999     Mongolia      Asia  1967   51.253   1149500  1226.041130

iloc 속성으로 데이터 추출

loc는 데이터프레임의 인덱스를 사용하여 데이터를 추출
iloc는 행번호를 사용하여 데이터를 추출

# iloc selects rows by integer position: position 1 is the second row
print(df.iloc[1])
print("\n==============================\n")
# position 99 is the same row as label 99 here because the index is the default 0..1703 range
print(df.iloc[99])

country      Afghanistan
continent           Asia
year                1957
lifeExp           30.332
pop              9240934
gdpPercap        820.853
Name: 1, dtype: object

==============================

country      Bangladesh
continent          Asia
year               1967
lifeExp          43.453
pop            62821884
gdpPercap       721.186
Name: 99, dtype: object

### iloc[-1] selects the last row (negative positions count from the end)
print(df.iloc[-1])
### position 1710 is past the end of this 1704-row frame, so this line raises IndexError
print(df.iloc[1710])

country      Zimbabwe
continent      Africa
year             2007
lifeExp        43.487
pop          12311143
gdpPercap     469.709
Name: 1703, dtype: object


IndexErrorTraceback (most recent call last)
<ipython-input-10-8a802f6f3585> in <module>
      1 ### iloc에 -1은 행 데이터를 추출
      2 print(df.iloc[-1])
----> 3 print(df.iloc[1710])

/anaconda3/lib/python3.6/site-packages/pandas/core/indexing.py in __getitem__(self, key)
   1476 
   1477             maybe_callable = com._apply_if_callable(key, self.obj)
-> 1478             return self._getitem_axis(maybe_callable, axis=axis)
   1479 
   1480     def _is_scalar_access(self, key):

/anaconda3/lib/python3.6/site-packages/pandas/core/indexing.py in _getitem_axis(self, key, axis)
   2100 
   2101             # validate the location
-> 2102             self._validate_integer(key, axis)
   2103 
   2104             return self._get_loc(key, axis=axis)

/anaconda3/lib/python3.6/site-packages/pandas/core/indexing.py in _validate_integer(self, key, axis)
   2007         l = len(ax)
   2008         if key >= l or key < -l:
-> 2009             raise IndexError("single positional indexer is out-of-bounds")
   2010 
   2011     def _getitem_tuple(self, tup):

IndexError: single positional indexer is out-of-bounds

# A list of positions selects several rows at once and returns a DataFrame
print(df.iloc[[0, 99, 999]])

         country continent  year  lifeExp       pop    gdpPercap
0    Afghanistan      Asia  1952   28.801   8425333   779.445314
99    Bangladesh      Asia  1967   43.453  62821884   721.186086
999     Mongolia      Asia  1967   51.253   1149500  1226.041130

### A bare ":" in the row slot selects every row
# loc picks columns by name
subset = df.loc[:, ["year", "pop"]]
print(subset.head())

# iloc picks columns by position; -1 is the last column (gdpPercap)
subset = df.iloc[:, [2, 4, -1]]
print(subset.head())

   year       pop
0  1952   8425333
1  1957   9240934
2  1962  10267083
3  1967  11537966
4  1972  13079460
   year       pop   gdpPercap
0  1952   8425333  779.445314
1  1957   9240934  820.853030
2  1962  10267083  853.100710
3  1967  11537966  836.197138
4  1972  13079460  739.981106

small_range = range(5) ## positions 0-4, i.e. the first five columns

subset = df.iloc[:, small_range]
print(subset.head())

print("\n==============================\n")
# positions 3, 4 and 5 as an explicit list
range2 = list(range(3, 6))
subset = df.iloc[:, range2]
print(subset.head())

       country continent  year  lifeExp       pop
0  Afghanistan      Asia  1952   28.801   8425333
1  Afghanistan      Asia  1957   30.332   9240934
2  Afghanistan      Asia  1962   31.997  10267083
3  Afghanistan      Asia  1967   34.020  11537966
4  Afghanistan      Asia  1972   36.088  13079460

==============================

   lifeExp       pop   gdpPercap
0   28.801   8425333  779.445314
1   30.332   9240934  820.853030
2   31.997  10267083  853.100710
3   34.020  11537966  836.197138
4   36.088  13079460  739.981106

# Every second column: range(0, 6, 2) yields positions 0, 2 and 4
range3 = list(range(0, 6, 2))
subset = df.iloc[:, range3]
print(subset.head())
print("\n==============================\n")
# The slice 0:6:2 selects the same three columns without building a list
subset = df.iloc[:, 0:6:2]
print(subset.head())

       country  year       pop
0  Afghanistan  1952   8425333
1  Afghanistan  1957   9240934
2  Afghanistan  1962  10267083
3  Afghanistan  1967  11537966
4  Afghanistan  1972  13079460

==============================

       country  year       pop
0  Afghanistan  1952   8425333
1  Afghanistan  1957   9240934
2  Afghanistan  1962  10267083
3  Afghanistan  1967  11537966
4  Afghanistan  1972  13079460

### loc vs. iloc: both calls below select the same rows and columns.
### The original note says loc is the better choice as the data set grows
### (presumably because column names are easier to read than positions).
print(df.iloc[[0, 99, 999], [0, 3, 5]])
print(df.loc[[0, 99, 999], ["country", "lifeExp", "gdpPercap"]])

         country  lifeExp    gdpPercap
0    Afghanistan   28.801   779.445314
99    Bangladesh   43.453   721.186086
999     Mongolia   51.253  1226.041130
         country  lifeExp    gdpPercap
0    Afghanistan   28.801   779.445314
99    Bangladesh   43.453   721.186086
999     Mongolia   51.253  1226.041130

from IPython.core.display import display, HTML

# Inject CSS so the notebook's .container element spans the full page width
display(HTML("<style> .container{width:100% !important;}</style>"))

06.handling_dataframe(bool) (0)	2018.12.09
05.handling_series(apply) (0)	2018.12.09
04.handling_series(basic) (0)	2018.12.09
03.create_data_frame (0)	2018.12.09
02.basic_statistic (0)	2018.12.09

게으른 우루루

01.data_slicing

데이터 확인하기¶

데이터 추출¶

columns 3개 추출 "country", "continent", "year"¶

loc 속성으로 데이터 추출¶

iloc 속성으로 데이터 추출¶

'pandas > basic' 카테고리의 다른 글

+ Recent posts

티스토리툴바