07_handling_dataframe(bool-apply)
In [1]:
import pandas as pd
from collections import OrderedDict
In [2]:
scientists = pd.read_csv("./data/scientists.csv")


열의 자료형 바꾸기와 새로운 열 추가

In [3]:
print("Born type: {}".format(scientists["Born"].dtype))
print("Died type: {}".format(scientists["Died"].dtype))
Born type: object
Died type: object
In [4]:
born_datetime = pd.to_datetime(scientists["Born"], format="%Y-%m-%d")
print("born_datetime: \n{}".format(born_datetime))

print("\n===================================\n")
died_datetime = pd.to_datetime(scientists["Died"], format="%Y-%m-%d")
print("died_datetime: \n{}".format(died_datetime))
born_datetime: 
0   1920-07-25
1   1876-06-13
2   1820-05-12
3   1867-11-07
4   1907-05-27
5   1813-03-15
6   1912-06-23
7   1777-04-30
Name: Born, dtype: datetime64[ns]

===================================

died_datetime: 
0   1958-04-16
1   1937-10-16
2   1910-08-13
3   1934-07-04
4   1964-04-14
5   1858-06-16
6   1954-06-07
7   1855-02-23
Name: Died, dtype: datetime64[ns]
In [5]:
scientists["born_dt"], scientists["died_dt"] = [born_datetime, died_datetime]
scientists.head()
Out[5]:
Name Born Died Age Occupation born_dt died_dt
0 Rosaline Franklin 1920-07-25 1958-04-16 37 Chemist 1920-07-25 1958-04-16
1 William Gosset 1876-06-13 1937-10-16 61 Statistician 1876-06-13 1937-10-16
2 Florence Nightingale 1820-05-12 1910-08-13 90 Nurse 1820-05-12 1910-08-13
3 Marie Curie 1867-11-07 1934-07-04 66 Chemist 1867-11-07 1934-07-04
4 Rachel Carson 1907-05-27 1964-04-14 56 Biologist 1907-05-27 1964-04-14
In [6]:
scientists["age_days"] = (scientists["died_dt"] - scientists["born_dt"])
scientists.head()
Out[6]:
Name Born Died Age Occupation born_dt died_dt age_days
0 Rosaline Franklin 1920-07-25 1958-04-16 37 Chemist 1920-07-25 1958-04-16 13779 days
1 William Gosset 1876-06-13 1937-10-16 61 Statistician 1876-06-13 1937-10-16 22404 days
2 Florence Nightingale 1820-05-12 1910-08-13 90 Nurse 1820-05-12 1910-08-13 32964 days
3 Marie Curie 1867-11-07 1934-07-04 66 Chemist 1867-11-07 1934-07-04 24345 days
4 Rachel Carson 1907-05-27 1964-04-14 56 Biologist 1907-05-27 1964-04-14 20777 days
In [7]:
import random

print(scientists["Age"])
print("\n===================================\n")
random.seed(42)
random.shuffle(scientists["Age"])
print(scientists["Age"])
0    37
1    61
2    90
3    66
4    56
5    45
6    41
7    77
Name: Age, dtype: int64

===================================

/anaconda3/lib/python3.6/random.py:277: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  x[i], x[j] = x[j], x[i]
0    66
1    56
2    41
3    77
4    90
5    45
6    37
7    61
Name: Age, dtype: int64


열 삭제

In [8]:
print(scientists.columns)

print("\n===================================\n")

scientists_dropped = scientists.drop(["Age"], axis=1)
print(scientists_dropped.columns)
Index(['Name', 'Born', 'Died', 'Age', 'Occupation', 'born_dt', 'died_dt',
       'age_days'],
      dtype='object')

===================================

Index(['Name', 'Born', 'Died', 'Occupation', 'born_dt', 'died_dt', 'age_days'], dtype='object')
In [9]:
from IPython.core.display import display, HTML

display(HTML("<style> .container{width:100% !important;}</style>"))

'pandas > basic' 카테고리의 다른 글

06.handling_dataframe(bool)  (0) 2018.12.09
05.handling_series(apply)  (0) 2018.12.09
04.handling_series(basic)  (0) 2018.12.09
03.create_data_frame  (0) 2018.12.09
02.basic_statistic  (0) 2018.12.09
06_handling_dataframe(bool)
In [1]:
import pandas as pd
from collections import OrderedDict
In [2]:
scientists = pd.read_csv("./data/scientists.csv")


boolean 추출과 broadcasting

In [3]:
bool_idx = scientists["Age"] > scientists["Age"].mean()
scientists[bool_idx]
Out[3]:
Name Born Died Age Occupation
1 William Gosset 1876-06-13 1937-10-16 61 Statistician
2 Florence Nightingale 1820-05-12 1910-08-13 90 Nurse
3 Marie Curie 1867-11-07 1934-07-04 66 Chemist
7 Johann Gauss 1777-04-30 1855-02-23 77 Mathematician
In [4]:
## 문자열 2배, 값2배 -- broadcasting
print(scientists*2)
                                       Name                  Born  \
0        Rosaline FranklinRosaline Franklin  1920-07-251920-07-25   
1              William GossetWilliam Gosset  1876-06-131876-06-13   
2  Florence NightingaleFlorence Nightingale  1820-05-121820-05-12   
3                    Marie CurieMarie Curie  1867-11-071867-11-07   
4                Rachel CarsonRachel Carson  1907-05-271907-05-27   
5                        John SnowJohn Snow  1813-03-151813-03-15   
6                    Alan TuringAlan Turing  1912-06-231912-06-23   
7                  Johann GaussJohann Gauss  1777-04-301777-04-30   

                   Died  Age                            Occupation  
0  1958-04-161958-04-16   74                        ChemistChemist  
1  1937-10-161937-10-16  122              StatisticianStatistician  
2  1910-08-131910-08-13  180                            NurseNurse  
3  1934-07-041934-07-04  132                        ChemistChemist  
4  1964-04-141964-04-14  112                    BiologistBiologist  
5  1858-06-161858-06-16   90                    PhysicianPhysician  
6  1954-06-071954-06-07   82  Computer ScientistComputer Scientist  
7  1855-02-231855-02-23  154            MathematicianMathematician  
In [5]:
from IPython.core.display import display, HTML
display(HTML("<style> .container{width:100% !important;}</style>"))

'pandas > basic' 카테고리의 다른 글

07.handling_dataframe(bool-apply)  (0) 2018.12.09
05.handling_series(apply)  (0) 2018.12.09
04.handling_series(basic)  (0) 2018.12.09
03.create_data_frame  (0) 2018.12.09
02.basic_statistic  (0) 2018.12.09
05_handling_series(apply)
In [1]:
import pandas as pd
from collections import OrderedDict

scientists = pd.read_csv('./data/scientists.csv')
In [2]:
ages = scientists['Age']

print('max: {}'.format(ages.max()))
print("mean: {}".format(ages.mean()))
max: 90
mean: 59.125


boolean 추출

  • 평균나이보다 나이가 많은 사람

모든 데이터에 대해 한 번에 연산하는 것을 Broadcasting이라 함

In [3]:
ages[ages > ages.mean()]
Out[3]:
1    61
2    90
3    66
7    77
Name: Age, dtype: int64


broadcasting

In [4]:
print(pd.Series([1, 100]))

print("\n=================================\n")
print(ages + pd.Series([1, 100]))
0      1
1    100
dtype: int64

=================================

0     38.0
1    161.0
2      NaN
3      NaN
4      NaN
5      NaN
6      NaN
7      NaN
dtype: float64
In [5]:
rev_age = ages.sort_index(ascending=False)
print(rev_age) # index의 역순
7    77
6    41
5    45
4    56
3    66
2    90
1    61
0    37
Name: Age, dtype: int64
In [6]:
print(ages*2)
print("\n=================================\n")

print(ages + ages.sort_index(ascending=False))
0     74
1    122
2    180
3    132
4    112
5     90
6     82
7    154
Name: Age, dtype: int64

=================================

0     74
1    122
2    180
3    132
4    112
5     90
6     82
7    154
Name: Age, dtype: int64

ages * 2ages+ages.sort_index(ascending=False)의 결과값이 동일 벡터와 벡터의 연산은 일치하는 인덱스끼리 값을 수행

In [7]:
from IPython.core.display import HTML, display
display(HTML("<style> .container{width:100% !important;}</style>"))

'pandas > basic' 카테고리의 다른 글

07.handling_dataframe(bool-apply)  (0) 2018.12.09
06.handling_dataframe(bool)  (0) 2018.12.09
04.handling_series(basic)  (0) 2018.12.09
03.create_data_frame  (0) 2018.12.09
02.basic_statistic  (0) 2018.12.09
04_handling_series(basic)
In [1]:
import pandas as pd
from collections import OrderedDict
In [2]:
scientists = pd.DataFrame(OrderedDict([
    ["Occupation", ["Chemist", "Statistician"]],
    ["Born", ["1920-07-25", "1876-06-13"]],
    ["Died", ["1958-04-16", "1937-10-16"]],
    ["Age", [37, 61]]
]), index=["Rosaline Franklin", "William Gosset"])
In [3]:
first_row = scientists.loc["William Gosset"]
print(first_row)

## Age열에 정수를 전달해도 series의 자료형을 오브젝트로 인식
Occupation    Statistician
Born            1876-06-13
Died            1937-10-16
Age                     61
Name: William Gosset, dtype: object


index, values, keys

In [4]:
print("index: \t{}".format(first_row.index))
print("values: \t{}".format(first_row.values))
print("keys: {}".format(first_row.keys()))
index: 	Index(['Occupation', 'Born', 'Died', 'Age'], dtype='object')
values: 	['Statistician' '1876-06-13' '1937-10-16' 61]
keys: Index(['Occupation', 'Born', 'Died', 'Age'], dtype='object')
In [5]:
# index 속성 응용
# index 속성의 첫번째 값 추출
print("{}".format(first_row.index[0]))
Occupation
In [6]:
# key method 
# key method 결과값을 이용하여 인덱스의 첫번째 값을 추출
print("{}".format(first_row.keys()[0]))
Occupation


statistic

In [7]:
ages = scientists["Age"]
print(ages)
Rosaline Franklin    37
William Gosset       61
Name: Age, dtype: int64
In [8]:
print("mean: {}".format(ages.mean()))
print("min: {}".format(ages.min()))
print("max: {}".format(ages.max()))
print("std: {}".format(ages.std()))
mean: 49.0
min: 37
max: 61
std: 16.97056274847714


series method

series describe
append 2개 이상의 시리즈 연결
describe 요약 통계량 계산
drop_duplicates 중복값이 없는 시리즈
equals 시리즈에 해당 값을 요소가 있는지 확인
get_values 시리즈 값 구하기(values 속성과 동일)
isin 시리즈에 포합된 값이 있는지 확인
min 최소값
max 최댓값
mean 평균
median 중간값
replace 특정 값을 가진 시리즈 값을 교체
sample 임의의 값을 반환
sort_values 값을 정렬
to_frame 시리즈를 데이터프레임으로 변환
In [9]:
from IPython.core.display import HTML, display
display(HTML("<style> .container{width:100% !important;}</style>"))


'pandas > basic' 카테고리의 다른 글

06.handling_dataframe(bool)  (0) 2018.12.09
05.handling_series(apply)  (0) 2018.12.09
03.create_data_frame  (0) 2018.12.09
02.basic_statistic  (0) 2018.12.09
01.data_slicing  (0) 2018.12.09
03_create_data_frame
In [1]:
import pandas as pd
In [2]:
s = pd.Series(["bonggu", 31])
print(s)
0    bonggu
1        31
dtype: object


Series 생성

In [3]:
s = pd.Series(["bong_gu", "chatterBOX"])
s = pd.Series(["bong_gu", "cahtterBOX"], index=["person", "who"])
print(s)
person       bong_gu
who       cahtterBOX
dtype: object


DataFrame 생성

  • dictionary
In [4]:
scientists = pd.DataFrame({
    'Name': ["Bong_gu", "Yang_o"],
    "Job" : ["buyer", "Data_analysis"],
    "Born": ["SEOUL", "DONGHAE"],
    "Age": [31, 31],
    "Hate": ["JaeHyeop","JAeHyeop"]
}, index=["Bong_gu", "Yang_o"], columns=["Name", "Job", "Born", "Hate", "Age"] )
scientists
Out[4]:
Name Job Born Hate Age
Bong_gu Bong_gu buyer SEOUL JaeHyeop 31
Yang_o Yang_o Data_analysis DONGHAE JAeHyeop 31
In [5]:
## DataFrame에서 순서보장
from collections import OrderedDict

scientists = pd.DataFrame(OrderedDict([
    ["name", ["Rosaline Franklin", "William Gosset"]],
    ["Occupation", ["Chemist", "Statistician"]],
    ["Born", ["1920-07-25", "1876-06-13"]],
    ["Died", ["1958-04-16", "1937-10-16"]],
    ["Age", [37, 61]]
]))

print(scientists)
                name    Occupation        Born        Died  Age
0  Rosaline Franklin       Chemist  1920-07-25  1958-04-16   37
1     William Gosset  Statistician  1876-06-13  1937-10-16   61
In [6]:
from IPython.core.display import display, HTML

display(HTML("<style> .container{width:100% !important;}</style>"))

'pandas > basic' 카테고리의 다른 글

06.handling_dataframe(bool)  (0) 2018.12.09
05.handling_series(apply)  (0) 2018.12.09
04.handling_series(basic)  (0) 2018.12.09
02.basic_statistic  (0) 2018.12.09
01.data_slicing  (0) 2018.12.09
02_basic_statistic
In [1]:
import pandas as pd

df = pd.read_csv("./data/gapminder.tsv", sep="\t")
In [2]:
print(df.head(n=10))
       country continent  year  lifeExp       pop   gdpPercap
0  Afghanistan      Asia  1952   28.801   8425333  779.445314
1  Afghanistan      Asia  1957   30.332   9240934  820.853030
2  Afghanistan      Asia  1962   31.997  10267083  853.100710
3  Afghanistan      Asia  1967   34.020  11537966  836.197138
4  Afghanistan      Asia  1972   36.088  13079460  739.981106
5  Afghanistan      Asia  1977   38.438  14880372  786.113360
6  Afghanistan      Asia  1982   39.854  12881816  978.011439
7  Afghanistan      Asia  1987   40.822  13867957  852.395945
8  Afghanistan      Asia  1992   41.674  16317921  649.341395
9  Afghanistan      Asia  1997   41.763  22227415  635.341351


groupby

In [3]:
print(df.groupby("year")["lifeExp"].mean())
print("\n======================================\n")
print(df.groupby("continent")["country"].nunique())
print("\n======================================\n")
year
1952    49.057620
1957    51.507401
1962    53.609249
1967    55.678290
1972    57.647386
1977    59.570157
1982    61.533197
1987    63.212613
1992    64.160338
1997    65.014676
2002    65.694923
2007    67.007423
Name: lifeExp, dtype: float64

======================================

continent
Africa      52
Americas    25
Asia        33
Europe      30
Oceania      2
Name: country, dtype: int64

======================================

In [4]:
df.groupby(["year", "continent"])["lifeExp"].mean()
Out[4]:
year  continent
1952  Africa       39.135500
      Americas     53.279840
      Asia         46.314394
      Europe       64.408500
      Oceania      69.255000
1957  Africa       41.266346
      Americas     55.960280
      Asia         49.318544
      Europe       66.703067
      Oceania      70.295000
1962  Africa       43.319442
      Americas     58.398760
      Asia         51.563223
      Europe       68.539233
      Oceania      71.085000
1967  Africa       45.334538
      Americas     60.410920
      Asia         54.663640
      Europe       69.737600
      Oceania      71.310000
1972  Africa       47.450942
      Americas     62.394920
      Asia         57.319269
      Europe       70.775033
      Oceania      71.910000
1977  Africa       49.580423
      Americas     64.391560
      Asia         59.610556
      Europe       71.937767
      Oceania      72.855000
1982  Africa       51.592865
      Americas     66.228840
      Asia         62.617939
      Europe       72.806400
      Oceania      74.290000
1987  Africa       53.344788
      Americas     68.090720
      Asia         64.851182
      Europe       73.642167
      Oceania      75.320000
1992  Africa       53.629577
      Americas     69.568360
      Asia         66.537212
      Europe       74.440100
      Oceania      76.945000
1997  Africa       53.598269
      Americas     71.150480
      Asia         68.020515
      Europe       75.505167
      Oceania      78.190000
2002  Africa       53.325231
      Americas     72.422040
      Asia         69.233879
      Europe       76.700600
      Oceania      79.740000
2007  Africa       54.806038
      Americas     73.608120
      Asia         70.728485
      Europe       77.648600
      Oceania      80.719500
Name: lifeExp, dtype: float64


graph

In [5]:
%matplotlib inline
import matplotlib.pyplot as plt

plt.rcParams["figure.figsize"] = (12, 9)
plt.rcParams['axes.unicode_minus'] = False
In [6]:
lifeExp = df.groupby("year")["lifeExp"].mean()
lifeExp.plot()
plt.show()
In [7]:
from IPython.core.display import display, HTML

display(HTML("<style> .container{width:100% !important;}</style>"))

'pandas > basic' 카테고리의 다른 글

06.handling_dataframe(bool)  (0) 2018.12.09
05.handling_series(apply)  (0) 2018.12.09
04.handling_series(basic)  (0) 2018.12.09
03.create_data_frame  (0) 2018.12.09
01.data_slicing  (0) 2018.12.09
01_data_slicing
In [1]:
import pandas as pd
In [2]:
df = pd.read_csv("./data/gapminder.tsv", sep="\t")


데이터 확인하기

In [3]:
print(df.head()) # 앞부분
print("\n{}".format(type(df)))
print("\n{}".format(df.shape)) # shape
print("\n{}".format(df.columns)) # shape
print("\n{}\n".format(df.dtypes)) # columns type
print("{}".format(df.info())) # info
       country continent  year  lifeExp       pop   gdpPercap
0  Afghanistan      Asia  1952   28.801   8425333  779.445314
1  Afghanistan      Asia  1957   30.332   9240934  820.853030
2  Afghanistan      Asia  1962   31.997  10267083  853.100710
3  Afghanistan      Asia  1967   34.020  11537966  836.197138
4  Afghanistan      Asia  1972   36.088  13079460  739.981106

<class 'pandas.core.frame.DataFrame'>

(1704, 6)

Index(['country', 'continent', 'year', 'lifeExp', 'pop', 'gdpPercap'], dtype='object')

country       object
continent     object
year           int64
lifeExp      float64
pop            int64
gdpPercap    float64
dtype: object

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1704 entries, 0 to 1703
Data columns (total 6 columns):
country      1704 non-null object
continent    1704 non-null object
year         1704 non-null int64
lifeExp      1704 non-null float64
pop          1704 non-null int64
gdpPercap    1704 non-null float64
dtypes: float64(2), int64(2), object(2)
memory usage: 80.0+ KB
None


데이터 추출

In [4]:
## columns 1개 추출
country_df = df["country"]
print(type(country_df))
print(country_df.head())
print("\n==============================\n")
print(country_df.tail())
<class 'pandas.core.series.Series'>
0    Afghanistan
1    Afghanistan
2    Afghanistan
3    Afghanistan
4    Afghanistan
Name: country, dtype: object

==============================

1699    Zimbabwe
1700    Zimbabwe
1701    Zimbabwe
1702    Zimbabwe
1703    Zimbabwe
Name: country, dtype: object


columns 3개 추출 "country", "continent", "year"

  • type
  • head
  • tail
In [5]:
subset = df[["country", "continent", "year"]]

print(subset.dtypes) ## type
print("\n==============================\n")
print(subset.head()) ## head
print("\n==============================\n")
print(subset.tail()) ## tail
country      object
continent    object
year          int64
dtype: object

==============================

       country continent  year
0  Afghanistan      Asia  1952
1  Afghanistan      Asia  1957
2  Afghanistan      Asia  1962
3  Afghanistan      Asia  1967
4  Afghanistan      Asia  1972

==============================

       country continent  year
1699  Zimbabwe    Africa  1987
1700  Zimbabwe    Africa  1992
1701  Zimbabwe    Africa  1997
1702  Zimbabwe    Africa  2002
1703  Zimbabwe    Africa  2007


loc 속성으로 데이터 추출

  • 인덱스가 음이 아닌 정수 사용
  • 인덱스 vs 행번호
In [6]:
## 인덱스가 0, 99인 데이터 추출

print(df.loc[0])
print("\n==============================\n")
print(df.loc[99])
country      Afghanistan
continent           Asia
year                1952
lifeExp           28.801
pop              8425333
gdpPercap        779.445
Name: 0, dtype: object

==============================

country      Bangladesh
continent          Asia
year               1967
lifeExp          43.453
pop            62821884
gdpPercap       721.186
Name: 99, dtype: object
In [7]:
## 마지막 데이터 추출, shape()이용
### shape는 정수로서 실제크기를 말하지만
### pandas의 인덱스는 0부터 시작하기때문에 1을 빼줘야함

last_row = df.shape[0]-1
print(df.loc[last_row])

print("\n==============================\n")
## 마지막 데이터 추출, tail()이용
### n 인자 사용
print(df.tail(n=1))
country      Zimbabwe
continent      Africa
year             2007
lifeExp        43.487
pop          12311143
gdpPercap     469.709
Name: 1703, dtype: object

==============================

       country continent  year  lifeExp       pop   gdpPercap
1703  Zimbabwe    Africa  2007   43.487  12311143  469.709298
In [8]:
### 인덱스가 0, 99, 999인 데이터를 추출
### list사용

print(df.loc[[0, 99, 999]])
         country continent  year  lifeExp       pop    gdpPercap
0    Afghanistan      Asia  1952   28.801   8425333   779.445314
99    Bangladesh      Asia  1967   43.453  62821884   721.186086
999     Mongolia      Asia  1967   51.253   1149500  1226.041130


iloc 속성으로 데이터 추출

  • loc는 데이터프레임의 인덱스를 사용하여 데이터를 추출
  • iloc는 행번호를 사용하여 데이터를 추출
In [9]:
print(df.iloc[1])
print("\n==============================\n")
print(df.iloc[99])
country      Afghanistan
continent           Asia
year                1957
lifeExp           30.332
pop              9240934
gdpPercap        820.853
Name: 1, dtype: object

==============================

country      Bangladesh
continent          Asia
year               1967
lifeExp          43.453
pop            62821884
gdpPercap       721.186
Name: 99, dtype: object
In [10]:
### iloc에 -1은 행 데이터를 추출
print(df.iloc[-1])
print(df.iloc[1710])
country      Zimbabwe
continent      Africa
year             2007
lifeExp        43.487
pop          12311143
gdpPercap     469.709
Name: 1703, dtype: object

IndexErrorTraceback (most recent call last)
<ipython-input-10-8a802f6f3585> in <module>
      1 ### iloc에 -1은 행 데이터를 추출
      2 print(df.iloc[-1])
----> 3 print(df.iloc[1710])

/anaconda3/lib/python3.6/site-packages/pandas/core/indexing.py in __getitem__(self, key)
   1476 
   1477             maybe_callable = com._apply_if_callable(key, self.obj)
-> 1478             return self._getitem_axis(maybe_callable, axis=axis)
   1479 
   1480     def _is_scalar_access(self, key):

/anaconda3/lib/python3.6/site-packages/pandas/core/indexing.py in _getitem_axis(self, key, axis)
   2100 
   2101             # validate the location
-> 2102             self._validate_integer(key, axis)
   2103 
   2104             return self._get_loc(key, axis=axis)

/anaconda3/lib/python3.6/site-packages/pandas/core/indexing.py in _validate_integer(self, key, axis)
   2007         l = len(ax)
   2008         if key >= l or key < -l:
-> 2009             raise IndexError("single positional indexer is out-of-bounds")
   2010 
   2011     def _getitem_tuple(self, tup):

IndexError: single positional indexer is out-of-bounds
In [11]:
print(df.iloc[[0, 99, 999]])
         country continent  year  lifeExp       pop    gdpPercap
0    Afghanistan      Asia  1952   28.801   8425333   779.445314
99    Bangladesh      Asia  1967   43.453  62821884   721.186086
999     Mongolia      Asia  1967   51.253   1149500  1226.041130
In [12]:
### : -- 모든행
subset = df.loc[:, ["year", "pop"]]
print(subset.head())

subset = df.iloc[:, [2, 4, -1]]
print(subset.head())
   year       pop
0  1952   8425333
1  1957   9240934
2  1962  10267083
3  1967  11537966
4  1972  13079460
   year       pop   gdpPercap
0  1952   8425333  779.445314
1  1957   9240934  820.853030
2  1962  10267083  853.100710
3  1967  11537966  836.197138
4  1972  13079460  739.981106
In [13]:
small_range = range(5) ## 

subset = df.iloc[:, small_range]
print(subset.head())

print("\n==============================\n")
range2 = list(range(3, 6))
subset = df.iloc[:, range2]
print(subset.head())
       country continent  year  lifeExp       pop
0  Afghanistan      Asia  1952   28.801   8425333
1  Afghanistan      Asia  1957   30.332   9240934
2  Afghanistan      Asia  1962   31.997  10267083
3  Afghanistan      Asia  1967   34.020  11537966
4  Afghanistan      Asia  1972   36.088  13079460

==============================

   lifeExp       pop   gdpPercap
0   28.801   8425333  779.445314
1   30.332   9240934  820.853030
2   31.997  10267083  853.100710
3   34.020  11537966  836.197138
4   36.088  13079460  739.981106
In [14]:
range3 = list(range(0, 6, 2))
subset = df.iloc[:, range3]
print(subset.head())
print("\n==============================\n")
subset = df.iloc[:, 0:6:2]
print(subset.head())
       country  year       pop
0  Afghanistan  1952   8425333
1  Afghanistan  1957   9240934
2  Afghanistan  1962  10267083
3  Afghanistan  1967  11537966
4  Afghanistan  1972  13079460

==============================

       country  year       pop
0  Afghanistan  1952   8425333
1  Afghanistan  1957   9240934
2  Afghanistan  1962  10267083
3  Afghanistan  1967  11537966
4  Afghanistan  1972  13079460
In [15]:
### loc, iloc, 데이터셋이 클 수록 loc 속성이 유리
print(df.iloc[[0, 99, 999], [0, 3, 5]])
print(df.loc[[0, 99, 999], ["country", "lifeExp", "gdpPercap"]])
         country  lifeExp    gdpPercap
0    Afghanistan   28.801   779.445314
99    Bangladesh   43.453   721.186086
999     Mongolia   51.253  1226.041130
         country  lifeExp    gdpPercap
0    Afghanistan   28.801   779.445314
99    Bangladesh   43.453   721.186086
999     Mongolia   51.253  1226.041130
In [16]:
from IPython.core.display import display, HTML

display(HTML("<style> .container{width:100% !important;}</style>"))

'pandas > basic' 카테고리의 다른 글

06.handling_dataframe(bool)  (0) 2018.12.09
05.handling_series(apply)  (0) 2018.12.09
04.handling_series(basic)  (0) 2018.12.09
03.create_data_frame  (0) 2018.12.09
02.basic_statistic  (0) 2018.12.09

+ Recent posts