# ๋ฐ์ดํฐ ์ถ์ฒ
kaggle์ Olist๊ธฐ์ ๋ฐ์ดํฐ ๋ถ์
## 1. Recency
R은 '고객이 최근에 구매했는가?'를 의미합니다. 고객의 마지막 활동이 언제인지를 나타내는 변수로, 최근에 구매한 고객일수록 높은 점수가 부여됩니다.
같은 길이로 구간을 나누는 pd.cut() 또는 같은 개수로 구간을 나누는 pd.qcut()이 있습니다.
# Latest purchase timestamp per customer_id, flattened back into a DataFrame.
recency = (
    olist_df
    .groupby('customer_id')['order_purchase_timestamp']
    .max()
    .reset_index()
)

# Reference point: the most recent purchase across all customers.
max_date = recency['order_purchase_timestamp'].max()
# Whole days between the reference point and each customer's last purchase.
# NOTE(review): `.dt.days` assumes the timestamp column is already datetime
# typed — confirm it is parsed before this point.
recency['diff_date'] = (
    max_date - recency['order_purchase_timestamp']
).dt.days
recency

# Quintile-bin the day gap; the smallest gaps (most recent buyers) get 5.
recency['R_score'] = pd.qcut(
    recency['diff_date'],
    q=5,
    labels=[5, 4, 3, 2, 1],
)


## 2. Frequency
F는 '고객이 얼마나 자주 구매했는가?'를 나타냅니다.
고객이 얼마나 자주 구매했는지를 나타내는 변수로, 자주 구매한 고객일수록 높은 점수가 부여됩니다.
# Count of distinct purchase timestamps per customer_id, largest first.
# After reset_index() the count column keeps the name
# 'order_purchase_timestamp'.
frequency = (
    olist_df
    .groupby('customer_id')['order_purchase_timestamp']
    .nunique()
    .sort_values(ascending=False)
    .reset_index()
)

# Histogram of the per-customer purchase counts.
sns.histplot(frequency['order_purchase_timestamp'])

def parse_values(X):
    """Convert a purchase count into a Frequency (F) score.

    Args:
        X: Purchase count for one customer.

    Returns:
        ``X`` itself when it is 1, 2, 3 or 4; otherwise 5. The fallback
        applies to every value outside 1-4, so counts of 5 or more are
        capped at 5 (and any unexpected value such as 0 also yields 5).
    """
    score_by_count = {
        1: 1,
        2: 2,
        3: 3,
        4: 4,
    }
    return score_by_count.get(X, 5)
# Score every customer's purchase count with parse_values.
frequency['F_score'] = (
    frequency['order_purchase_timestamp']
    .apply(parse_values)
)

## 3. Monetary
M은 '고객이 얼마나 구매했나?'를 측정하는 지표입니다.
고객이 구매한 총 금액을 의미하는 변수로, 구매 금액이 높은 고객일수록 높은 점수가 부여됩니다.
# Total payment_value per customer_id.
monetary = (
    olist_df
    .groupby('customer_id')['payment_value']
    .sum()
    .reset_index()
)

# Quintile-bin the totals; the largest totals get 5.
monetary['M_score'] = pd.qcut(
    monetary['payment_value'],
    q=5,
    labels=[1, 2, 3, 4, 5],
)

## 4. merge
# Join the three per-customer tables on customer_id (pandas' default inner
# join). recency and frequency both carry an 'order_purchase_timestamp'
# column, so pandas disambiguates them with its default _x / _y suffixes.
rfm_score = (
    recency
    .merge(frequency, on='customer_id')
    .merge(monetary, on='customer_id')
)

형 변환 후 합치기
# Cast each score to str, stored under the lowercase column name, so the
# three digits can be concatenated into a single code.
for score_col in ('R_score', 'F_score', 'M_score'):
    rfm_score[score_col.lower()] = rfm_score[score_col].astype(str)

# Concatenate the digits in R, F, M order to form the combined RFM code.
rfm_score['rfm_score'] = (
    rfm_score['r_score']
    + rfm_score['f_score']
    + rfm_score['m_score']
)

## 5. 세그먼테이션 진행
# Regex pattern (alternation of three-digit RFM codes) -> segment label.
# Together the patterns cover every code from '111' to '555'; the code '541'
# was previously missing from all groups, which left those customers with the
# raw code as their segment. It now falls under the general-customer group
# alongside 542-545.
# The labels are restored from mis-encoded text to the intended Korean.
segments = {
    # VIP customers
    r'555': 'VIP 고객',
    # Customers at risk of churning
    r'211|212|213|221|222|223|231|232|233': '이탈 우려 고객',
    # Churned customers (every code with R == 1)
    r'111|112|113|114|115|121|122|123|124|125|131|132|133|134|135|141|142|143|144|145|151|152|153|154|155': '이탈 고객',
    # Loyal customers
    r'433|434|443|444|453|454': '충성 고객',
    # New customers
    r'511|512|513|514|515': '신규 고객',
    # General customers (everything else), grouped by R digit for readability
    (
        r'214|215|224|225|234|235|241|242|243|244|245|251|252|253|254|255'
        r'|311|312|313|314|315|321|322|323|324|325|331|332|333|334|335'
        r'|341|342|343|344|345|351|352|353|354|355'
        r'|411|412|413|414|415|421|422|423|424|425|431|432|435'
        r'|441|442|445|451|452|455'
        r'|521|522|523|524|525|531|532|533|534|535'
        r'|541|542|543|544|545|551|552|553|554'
    ): '일반 고객',
}
# NOTE(review): rfm_score2 is not defined in the visible part of this file;
# presumably it is a copy of rfm_score made elsewhere — confirm.
rfm_score2['segment'] = rfm_score2['rfm_score'].replace(segments, regex=True)


'Knowledge🦢 > Python' 카테고리의 다른 글
| [Python] EDA 기본 & 변환 함수 정리 (0) | 2024.08.22 |
|---|---|
| [Python] 아이템별 조합 찾기, 2개의 테이블 합치기⚡️ (0) | 2024.07.11 |
| [Python] 카테고리 분류하기⚡️ (0) | 2024.07.10 |
| [Python] 파이썬 iterable 의 뜻 (0) | 2024.05.08 |
| [Python] 파이썬 기초문법 복습 (0) | 2024.04.05 |