[๋ฐ์ดํฐ ์ ์ฒ๋ฆฌ&์๊ฐํ 3] ๋ถ๋ฆฌ์ธ ์ธ๋ฑ์ฑ, ๋ฐ์ดํฐ ๋ณํฉ
< ๋ถ๋ฆฌ์ธ ์ธ๋ฑ์ฑ >
# ๋ฐ์ดํฐ ๋ถ๋ฌ์ค๊ธฐ
df = pd.read_csv("tips_data.csv")
df
# ๋ถ๋ฆฌ์ธ ์ธ๋ฑ์ฑ
๊ฐ๋ก ์ถ์ผ๋ก ๋ถ๋ฆฌ์ธ ์ธ๋ฑ์ฑ์ ์ ์ฉํ๋ค
[ ์์ 1 ]
selected = df['sex'] == 'Male'
df[selected]
[ ์์ 2 ]
์ฌ๋ฌ๊ฐ์ง ์กฐ๊ฑด์ & ์ฐ์ฐ์๋ก ์ด์ด์ฃผ๊ธฐ
ํ์์ ( ) & ( )
๋ฐ๋์ ๊ดํธ๋ฅผ ๋ฃ์ด์ค๊ฒ!!
selected = (df['sex'] == 'Male') & (df['smoker'] == 'Yes')
df[selected]
[ ์์ 3 ]
์กฐ๊ธ ๋ ๊ฐํธํ๊ฒ ๋ํ๋ด๊ธฐ
๋ถ๋ฆฌ์ธ ์กฐ๊ฑด์ ๋ํ๋ด๋ ๋ณ์์ ๋ด์์ ๋ํ๋ด๊ธฐ
selected = (df['sex'] == 'Male') | (df['smoker'] == 'Yes')
df[selected]
[ ์์ 4 ]
์กฐ๊ธ ๋ ๊ฐํธํ๊ฒ ๋ํ๋ด๊ธฐ
์กฐ๊ฑด๋ผ๋ฆฌ ๋ฌถ์์๋ ์๋ค
cond1 = df['size'] >= 3
cond2 = df['tip'] < 2
df[cond1 & cond2]
[ ์์ 5 ]
์ฌ๋ฌ๊ฐ์ง ์กฐ๊ฑด์ \๋ฅผ ์ฌ์ฉํ์ฌ ๋ฌถ์ด์ฃผ๊ธฐ
cond = (df['sex'] == 'Male') \
& (df['tip'] > 3) \
& (df['smoker'] == 'Yes') \
& (df['total_bill'] < 20) \
& (df['size'] == 4)
df[cond]
# isin()์ ์ด์ฉํ ํํฐ๋ง
Series(์๋ฆฌ์ฆ)๋ DataFrame(๋ฐ์ดํฐํ๋ ์)์ ๊ฐ๋ค ์ค์์ ํน์ ๊ฐ์ด๋ ๋ฆฌ์คํธ ์์ ํฌํจ๋ ๊ฐ๋ค์ ์ฐพ์๋ด๋ ๋ฉ์๋.
์ํ๋ ์กฐ๊ฑด์ ํด๋นํ๋ ๋ฐ์ดํฐ๋ฅผ ๋น ๋ฅด๊ฒ ํํฐ๋งํ๊ฑฐ๋ ์ ํํ ์ ์์
isin() ์ [ ] ๋ฆฌ์คํธ ์์ ๊ฐ์ ๋ฃ๋๋ค
1) ๋จ์ผ ๊ฐ ํฌํจ ์ฌ๋ถ ํ์ธ
# Series๋ DataFrame์ ํน์ ์ด์์ ๋จ์ผ ๊ฐ์ด ํฌํจ๋์ด ์๋์ง ์ฌ๋ถ๋ฅผ ํ์ธํฉ๋๋ค.
import pandas as pd
data = {'A': [1, 2, 3, 4, 5],
'B': ['apple', 'banana', 'orange', 'grape', 'melon']}
df = pd.DataFrame(data)
df
# 'B' ์ด์์ 'banana' ๊ฐ์ด ์๋์ง ํ์ธ
result = df['B'].isin(['banana'])
print(result)
2) ์ฌ๋ฌ ๊ฐ ํฌํจ ์ฌ๋ถ ํ์ธ
df[df['day'].isin(['Sun', 'Thur'])]
3) ๋ฐ์ดํฐํ๋ ์ ์ ์ฒด์์์ ์ฌ์ฉ
๋์ ๋๋ฆฌ ํํ๋ก ์ฌ์ฉํ๋ค
result = df.isin({'A': [1, 3], 'B': ['apple', 'orange']})
print(result)
# ์ปฌ๋ผ ์ถ๊ฐํ๊ธฐ - ๋จ์๊ฐ
df['created_at'] = '2024-01-01'
# ๋ฐ์ดํฐํ ํ์ธํ๊ธฐ
df['created_at'].dtype
# ๊ฒฐ๊ณผ
dtype('O')
โป dtype('O')์ ๋ป์ด๋
Object ํ์ด๋ผ๋ ๋ป์ด๋ค!
df.info()
# ๋ ์ง ํ์์ผ๋ก ๋ฐ๊พธ๊ธฐ
df['created_at'] = pd.to_datetime(df['created_at'])
df.info()
# ์ปฌ๋ผ ์ถ๊ฐํ๊ธฐ - ์ฐ์ฐ์ผ๋ก ์ถ๊ฐํ๊ธฐ
df['revenue'] = df['total_bill'] + df['tip']
# ๋ฐฑ๋ถ์จ ํผ์ผํธ ์ถ๊ฐํ๊ธฐ
df['tip_percentage'] = df['tip'] / df['revenue'] * 100
< ๋ฐ์ดํฐ ๋ณํฉ >
# ๋ฐ์ดํฐ ๋ณํฉ ์์ ๋ฐ์ดํฐ
# ๋ฐ์ดํฐ ๋ณํฉ
df1 = pd.DataFrame({
'A' : ['A0', 'A1', 'A2', 'A3'],
'B' : ['B0', 'B1', 'B2', 'B3'],
'C' : ['C0', 'C1', 'C2', 'C3'],
'D' : ['D0', 'D1', 'D2', 'D3'],
})
df2 = pd.DataFrame({
'A' : ['A4', 'A5', 'A6', 'A7'],
'B' : ['B4', 'B5', 'B6', 'B7'],
'C' : ['C4', 'C5', 'C6', 'C7'],
'D' : ['D4', 'D5', 'D6', 'D7'],
})
df3 = pd.DataFrame({
'A' : ['A8', 'A9', 'A10', 'A11'],
'B' : ['B8', 'B9', 'B10', 'B11'],
'C' : ['C8', 'C9', 'C10', 'C11'],
'D' : ['D8', 'D9', 'D10', 'D11'],
})
< 1. concat >
concat ํจ์๋ ๋ฐ์ดํฐํ๋ ์์ ์์๋ ํน์ ์ข์ฐ๋ก ํฉ์น ์ ์๋ค
- axis
์ฐ๊ฒฐํ๊ณ ์ ํ๋ ์ถ(๋ฐฉํฅ)์ ์ง์ ํฉ๋๋ค.
๊ธฐ๋ณธ๊ฐ์ 0์ผ๋ก, ์์๋๋ก ์ฐ๊ฒฐํ๋ ๊ฒฝ์ฐ์ ํด๋นํฉ๋๋ค. 1๋ก ์ค์ ํ๋ฉด ์ข์ฐ๋ก ์ฐ๊ฒฐํฉ๋๋ค.
- ignore_index
๊ธฐ๋ณธ๊ฐ์ False์ด๋ฉฐ, ์ฐ๊ฒฐ๋ ๊ฒฐ๊ณผ ๋ฐ์ดํฐํ๋ ์์ ์ธ๋ฑ์ค๋ฅผ ์ ์งํฉ๋๋ค. True๋ก ์ค์ ํ๋ฉด ์๋ก์ด ์ธ๋ฑ์ค๋ฅผ ์์ฑํฉ๋๋ค. (True → ๊ธฐ์กด ์ธ๋ฑ์ค๋ฅผ ๋ฌด์ํ๊ณ ์๋กญ๊ฒ ์ธ๋ฑ์ค๋ฅผ ์ค์ )
# ์ํ ๋ฐ์ดํฐ
# ๋ฐ์ดํฐ ๋ณํฉ
df1 = pd.DataFrame({
'A' : ['A0', 'A1', 'A2', 'A3'],
'B' : ['B0', 'B1', 'B2', 'B3'],
'C' : ['C0', 'C1', 'C2', 'C3'],
'D' : ['D0', 'D1', 'D2', 'D3'],
})
df2 = pd.DataFrame({
'A' : ['A4', 'A5', 'A6', 'A7'],
'B' : ['B4', 'B5', 'B6', 'B7'],
'C' : ['C4', 'C5', 'C6', 'C7'],
'D' : ['D4', 'D5', 'D6', 'D7'],
})
df3 = pd.DataFrame({
'A' : ['A8', 'A9', 'A10', 'A11'],
'B' : ['B8', 'B9', 'B10', 'B11'],
'C' : ['C8', 'C9', 'C10', 'C11'],
'D' : ['D8', 'D9', 'D10', 'D11'],
})
# concat์ผ๋ก ํฉ์น๊ธฐ
pd.concat([df1, df2, df3])
pd.concat([df1, df2, df3]).reset_index(drop=True)
# axis = 0 ์ธ ์์๋๊ฐ ๊ธฐ๋ณธ๊ฐ์ด๋ค
# ๊ธฐ๋ณธ๊ฐ์ ์์๋์ธ axis = 0
pd.concat([df1, df2, df3], axis=0).reset_index(drop=True)
# ๋ฐ์ดํฐ ์ํ
# ๋ฐ์ดํฐ ๊ฐ์ ธ์ค๊ธฐ
df1 = pd.DataFrame({
'A' : ['A0', 'A1', 'A2', 'A3'],
'B' : ['B0', 'B1', 'B2', 'B3'],
'C' : ['C0', 'C1', 'C2', 'C3'],
'D' : ['D0', 'D1', 'D2', 'D3'],
})
df2 = pd.DataFrame({
'A' : ['A4', 'A5', 'A6', 'A7'],
'B' : ['B4', 'B5', 'B6', 'B7'],
'C' : ['C4', 'C5', 'C6', 'C7'],
'D' : ['D4', 'D5', 'D6', 'D7'],
})
# concat์์ axis = 1 ๊ฐ๋ก์ธ๋ก ํด๋ณด๊ธฐ
df = pd.concat([df1, df2], axis=1)
df
# ๋ง์ฝ ์์๋ ๋ถ์ผ๋ ๋ค๋ฅด๋ค๋ฉด
# ๋ง์ฝ ์์๋ ๋ถ์ผ๋ ๋ค๋ฅด๋ค๋ฉด
# ๋ฐ์ดํฐ ๋ณํฉ
df1 = pd.DataFrame({
'A' : ['A0', 'A1', 'A2', 'A3'],
'B' : ['B0', 'B1', 'B2', 'B3'],
'C' : ['C0', 'C1', 'C2', 'C3'],
'D' : ['D0', 'D1', 'D2', 'D3'],
})
df2 = pd.DataFrame({
'A' : ['A4', 'A5', 'A6', 'A7'],
'B' : ['B4', 'B5', 'B6', 'B7'],
'C' : ['C4', 'C5', 'C6', 'C7'],
'D' : ['D4', 'D5', 'D6', 'D7'],
})
df3 = pd.DataFrame({
'A' : ['A8', 'A9', 'A10', 'A11'],
'B' : ['B8', 'B9', 'B10', 'B11'],
'C' : ['C8', 'C9', 'C10', 'C11'],
'D' : ['D8', 'D9', 'D10', 'D11'],
'E' : ['E8', 'E9', 'E10', 'E11']
})
df = pd.concat([df1, df2, df3], axis=0).reset_index(drop=True)
df
# ๋ง์ฝ ๊ฐ๋ก์ธ๋ก ๋ถ์ผ๋ ๋ค๋ฅด๋ค๋ฉด
# ๋ง์ฝ ๊ฐ๋ก์ธ๋ก ๋ถ์ผ๋ ๋ค๋ฅด๋ค๋ฉด
# ๋ฐ์ดํฐ ๋ณํฉ
df1 = pd.DataFrame({
'A' : ['A0', 'A1', 'A2', 'A3'],
'B' : ['B0', 'B1', 'B2', 'B3'],
'C' : ['C0', 'C1', 'C2', 'C3'],
'D' : ['D0', 'D1', 'D2', 'D3'],
})
df2 = pd.DataFrame({
'A' : ['A4', 'A5', 'A6', 'A7','A8'],
'B' : ['B4', 'B5', 'B6', 'B7','B8'],
'C' : ['C4', 'C5', 'C6', 'C7','C8'],
'D' : ['D4', 'D5', 'D6', 'D7','D8'],
})
df = pd.concat([df1, df2], axis=1)
df
< 2. merge >
๋ฐ์ดํฐ๋ฅผ ์ข์ฐ๋ก ํฉ์น๊ธฐ
SQL์ JOIN ์ฐ์ฐ๊ณผ ์ ์ฌํ ๋ฐฉ์์ผ๋ก ๋ฐ์ดํฐํ๋ ์์ ํฉ์น ์ ์์ต๋๋ค.
์ฃผ๋ก ๋ ๊ฐ ์ด์์ ๋ฐ์ดํฐํ๋ ์์์ ๊ณตํต๋ ์ด์ด๋ ์ธ๋ฑ์ค๋ฅผ ๊ธฐ์ค์ผ๋ก ๋ฐ์ดํฐ๋ฅผ ๋ณํฉํ ๋ ํ์ฉ๋ฉ๋๋ค.
# how
๋ณํฉ ๋ฐฉ๋ฒ์ ๋ํ๋ด๋ ๋งค๊ฐ๋ณ์๋ก, 'inner', 'outer', 'left', 'right' ๋ฑ์ ์ต์ ์ด ์์ต๋๋ค.
1) inner
๊ณตํต๋ ํค(์ด)๋ฅผ ๊ธฐ์ค์ผ๋ก ๊ต์งํฉ์ ๋ง๋ญ๋๋ค.
2) outer
๊ณตํต๋ ํค๋ฅผ ๊ธฐ์ค์ผ๋ก ํฉ์งํฉ์ ๋ง๋ญ๋๋ค.
3) left
์ผ์ชฝ ๋ฐ์ดํฐํ๋ ์์ ๋ชจ๋ ํ์ ํฌํจํ๊ณ ์ค๋ฅธ์ชฝ ๋ฐ์ดํฐํ๋ ์์ ๊ณตํต๋ ํค์ ํด๋นํ๋ ํ๋ง ํฌํจํฉ๋๋ค.
4) right
์ค๋ฅธ์ชฝ ๋ฐ์ดํฐํ๋ ์์ ๋ชจ๋ ํ์ ํฌํจํ๊ณ ์ผ์ชฝ ๋ฐ์ดํฐํ๋ ์์ ๊ณตํต๋ ํค์ ํด๋นํ๋ ํ๋ง ํฌํจํฉ๋๋ค.
# ๋ฐ์ดํฐํ๋ ์ ์์
# 5-2. merge
df1 = pd.DataFrame({
'key' : ['A', 'B', 'C', 'D'],
'value' : [1,2,3,4]
})
df2 = pd.DataFrame({
'key' : ['B', 'D', 'D', 'E'],
'value' : [5,6,7,8]
})
# inner
# ๊ธฐ๋ณธ๊ฐ์ inner
pd.merge(df1, df2, on='key', how='inner')
# outer
pd.merge(df1, df2, on='key', how='outer')
# left
pd.merge(df1, df2, on='key', how='left')