< 데이터 집계 >
# Group by()
Group by 함수는 데이터프레임을 그룹화하여 그룹에 대한 연산을 수행한 후 결과를 출력한다
# 샘플 데이터
# Learning groupby: a six-row frame whose rows alternate between
# categories 'A' and 'B'.
data = {
    'Category': ['A', 'B', 'A', 'B', 'A', 'B'],
    'Value': [1, 2, 3, 4, 5, 6],
}
df = pd.DataFrame(data)

# Show the mean of Value for each category.
df.groupby(by='Category').mean()

# Per-category aggregates. Each call groups df by 'Category' and applies one
# aggregate to the remaining columns.

# Sum
df.groupby('Category').sum()

# Count of values
df.groupby('Category').count()

# Maximum
df.groupby('Category').max()

# Minimum
df.groupby('Category').min()

# First row of each group
df.groupby('Category').first()

# agg function
# agg can apply several functions at once. Here the built-in list is passed,
# so the values of each group are collected into a list.
df.groupby('Category').agg(list)

# Load the sample data from tips_data.csv and display it.
df = pd.read_csv("tips_data.csv")
df

# Show only the day, total_bill, tip and size columns.
selected_columns = ['day', 'total_bill', 'tip', 'size']
df[selected_columns]

# groupby example
# The groupby column is placed on the left (it becomes the index), and the
# aggregate function is applied to each of the remaining columns.
df[['day', 'total_bill', 'tip', 'size']].groupby('day').mean()

# groupby on two or more columns
df[['sex', 'day', 'total_bill', 'tip', 'size']].groupby(['sex', 'day']).mean()

# Same keys in the opposite order: 'day' is now listed first.
df[['sex', 'day', 'total_bill', 'tip', 'size']].groupby(['day', 'sex']).mean()

# Within each (day, sex) group, apply a different aggregate to each column:
# the maximum total_bill, the mean tip and the summed size.
# A trailing backslash continues the chained expression on the next line.
summary = df[['sex', 'day', 'total_bill', 'tip', 'size']].groupby(['day', 'sex']) \
    .agg({'total_bill': 'max', 'tip': 'mean', 'size': 'sum'})
summary

# Question: how do we read total_bill for the ('Fri', 'Female') group?
# The aggregated frame is indexed by (day, sex), so the pair is passed to .loc
# as a tuple. The lookup must go through the aggregated result, not df: df was
# never reassigned, so it has no (day, sex) index to look the pair up in.
summary.loc[('Fri', 'Female'), 'total_bill']
로 접근할 수 있다
[ 결과 ]

< ํผ๋ฒ ํ ์ด๋ธ >
Pivot Table๋?
๋ฐ์ดํฐ๋ฅผ ์ฌ๊ตฌ์ฑํ์ฌ ์์ฝ, ์ง๊ณ๋ ์ ๋ณด๋ฅผ ๋ณด์ฌ์ฃผ๋ ํ ์ด๋ธ ํํ
# ์ํ ๋ฐ์ดํฐ
# 6-2. pivot table: five rows spread over two dates and two categories.
records = {
    'Date': ['2023-01-01', '2023-01-01', '2023-01-02', '2023-01-02', '2023-01-01'],
    'Category': ['A', 'B', 'A', 'B', 'A'],
    'Value': [10, 20, 30, 40, 50],
}
df = pd.DataFrame(records)
df

# Build the pivot table: one row per Date, one column per Category, with
# Value summed where several rows share the same (Date, Category) pair.
pivot = pd.pivot_table(
    df,
    values='Value',
    index='Date',
    columns='Category',
    aggfunc='sum',
)

# Sample data: the same rows as before plus a SubCategory column.
records = {
    'Date': ['2023-01-01', '2023-01-01', '2023-01-02', '2023-01-02', '2023-01-01'],
    'Category': ['A', 'B', 'A', 'B', 'A'],
    'SubCategory': ['X', 'X', 'Y', 'Y', 'X'],
    'Value': [10, 20, 30, 40, 50],
}
df = pd.DataFrame(records)
df

# Build the pivot table with two column levels (Category, then SubCategory),
# summing Value for each Date.
column_levels = ['Category', 'SubCategory']
pivot = df.pivot_table(
    values='Value',
    index='Date',
    columns=column_levels,
    aggfunc='sum',
)
pivot

# Build a pivot table that uses a different aggregate for each value column.
import pandas as pd

# Create the sample DataFrame with two value columns.
data = {
    'Date': ['2023-01-01', '2023-01-01', '2023-01-02', '2023-01-02', '2023-01-01'],
    'Category': ['A', 'B', 'A', 'B', 'A'],
    'Value1': [10, 20, 30, 40, 50],
    'Value2': [100, 200, 300, 400, 500],
}
df = pd.DataFrame(data)
df

# Create the pivot table: 'Date' is the row index and 'Category' the column
# index; 'Value1' is aggregated with the mean and 'Value2' with the sum.
pivot = df.pivot_table(
    index='Date',
    columns='Category',
    values=['Value1', 'Value2'],
    aggfunc={'Value1': 'mean', 'Value2': 'sum'},
)
print(pivot)

< 데이터 정렬하기 >
# 샘플 데이터
# Sample frame for the sorting examples, written one row per person.
rows = [
    ('Alice', 25, 85),
    ('Bob', 22, 88),
    ('Charlie', 30, 83),
    ('David', 30, 90),
    ('Eva', 18, 92),
]
df = pd.DataFrame(rows, columns=['Name', 'Age', 'Score'])
df

1. 값 정렬하기
# Sort by Age in ascending order.
# Ascending is the default, the same default as Python's sort().
df.sort_values(by='Age')

# Sort by Age in descending order.
df.sort_values(by='Age', ascending=False)

# Sort by two or more columns: Age ascending, then Score descending.
df.sort_values(by=['Age', 'Score'], ascending=[True, False])

# 2. Sorting by index (descending here).
df.sort_index(ascending=False)
