tak's data blog

빠르게 보는 SQL 레시피 본문

SQL

빠르게 보는 SQL 레시피

hyuntaek 2024. 6. 15. 00:20

 

SQL 레시피 책을 공부하면서 현업에서 쓰일법한 코드들을 타이핑 해두었고,

빠르게 필요할 때마다 보기 위해서 다음과 같이 정리해두었다.

 

한번 훑어보긴 했지만 아직 직접 적용해보지 못한 코드가 있어서 추후에 다시 작성해야겠다.

-- 사용자의 방문 빈도 집계하기
WITH
action_log_with_dt AS 
(
SELECT *
    ,SUBSTR(STAMP, 1, 10) AS dt
  FROM action_log
)
,action_day_count_per_user AS 
(
SELECT 
        user_id
       ,COUNT(DISTINCT dt) AS action_day_count
  FROM
    action_log_with_dt
 WHERE
    dt BETWEEN '2016-11-01' AND '2016-11-07'
GROUP BY user_id
)

SELECT 
        action_day_count
        ,COUNT(DISTINCT user_id) AS user_count
  FROM action_day_count_per_user
GROUP BY action_day_count
ORDER BY action_day_count;

 

 

 

 

 

-- 구매액이 많은 순서로 사용자 그룹을 10등분하는 쿼리
WITH user_purchase_amount AS
(
SELECT
        user_id
        ,SUM(amount) AS purchase_amount
  FROM action_log
 WHERE action = 'purchase'
GROUP BY user_id
)
, users_with_decile AS
(
SELECT 
        user_id
        ,purchase_amount
        ,ntile(10) OVER (ORDER BY purchase_amount DESC) AS decile
  FROM user_purchase_amount
)

SELECT *
  FROM users_with_decile

 

 

 

 

 

 

 

-- 10분할한 Decile들을 집계하는 쿼리
WITH user_purchase_amount AS
(
SELECT
        user_id
        ,SUM(amount) AS purchase_amount
  FROM action_log
 WHERE action = 'purchase'
GROUP BY user_id
)
, users_with_decile AS
(
SELECT 
        user_id
        ,purchase_amount
        ,ntile(10) OVER (ORDER BY purchase_amount DESC) AS decile
  FROM user_purchase_amount
)
, decile_with_purchase_amount AS 
(
SELECT 
        decile
        ,SUM(purchase_amount) AS amount
        ,AVG(purchase_amount) AS avg_amount
        ,SUM(SUM(purchase_amount)) OVER (ORDER BY decile) AS cumulative_amount
        ,SUM(SUM(purchase_amount)) OVER () AS total_amount
  FROM users_with_decile
GROUP BY decile
)


SELECT *
  FROM decile_with_purchase_amount

 

 

 

 

 

-- 사용자별로 RFM을 집계하는 쿼리
WITH purchase_log AS
(
SELECT 
        user_id
        ,amount
        ,substring(stamp,1,10) AS dt
  FROM action_log
 WHERE action = 'purchase'
),
user_rfm AS 
(
SELECT 
        user_id
        ,MAx(dt) AS recent_date
        ,date_diff(CURRENT_DATE, date(timestamp(MAX(dt))), day) AS recency
        ,COUNT(dt) AS frequency
        ,SUM(amount) AS monetary
  FROM purchase_log
GROUP BY user_id
)

SELECT *
  FROM user_rfm
  
  
-- 사용자들의 RFM 랭크를 계산하는 쿼리
WITH user_rfm AS
(
-- 위 코드 참고
),
user_rfm_rank AS
(
SELECT
        user_id
        ,recent_date
        ,recency
        ,frequency
        ,monetary
        ,CASE WHEN recency < 14 THEN 5
              WHEN recency < 28 THEN 4
              WHEN recency < 60 THEN 3
              WHEN recency < 90 THEN 2
              ELSE 1 END AS r
        ,CASE WHEN 20 <= frequency THEN 5
              WHEN 10 <= frequency THEN 4
              WHEN 5 <= frequency THEN 3
              WHEN 2 <= frequency THEN 2
              WHEN 1 = frequency THEN 1 END AS f
        ,CASE WHEN 300000 <= monetary THEN 5
              WHEN 100000 <= frequency THEN 4
              WHEN 300000 <= frequency THEN 3
              WHEN 5000 <= frequency THEN 2
              ELSE 1 END AS m
  FROM user_rfm
)


SELECT *
  FROM user_rfm_rank


-- 각 그룹의 사람 수를 확인하는 쿼리
WITH user_rfm AS
(
-- 위 코드 참고
),
user_rfm_rank AS
(
-- 위 코드 참고
),
mst_rfm_index AS
(
          SELECT 1 AS rfm_index
UNION ALL SELECT 2 AS rfm_index
UNION ALL SELECT 3 AS rfm_index
UNION ALL SELECT 4 AS rfm_index
UNION ALL SELECT 5 AS rfm_index
),
rfm_flag AS (
SELECT 
        m.rfm_index
        ,CASE WHEN m.rfm_index = r.r THEN 1 ELSE 0 END AS r_flag
        ,CASE WHEN m.rfm_index = r.f THEN 1 ELSE 0 END AS f_flag
        ,CASE WHEN m.rfm_index = r.m THEN 1 ELSE 0 END AS m_flag
  FROM mst_rfm_index AS m CROSS JOIN user_rfm_rank AS r
)


SELECT 
        rfm_index
        ,SUM(r_falg) AS r
        ,SUM(f_flag) AS f
        ,SUM(m_flag) AS m
  FROM rfm_flag
GROUP BY rfm_index
ORDER BY rfm_index DESC


-- 통합 랭크를 계산하는 쿼리
WITH user_rfm AS
(
-- 위 코드 참고
),
user_rfm_rank AS
(
-- 위 코드 참고
)

SELECT 
        r+f+m AS total_rank
        ,r,f,m
        ,COUNT(user_id)
  FROM user_rfm_rank
GROUP BY r,f,m
ORDER BY total_rank DESC, r DESC, f DESC, m DESC


-- 종합 랭크별로 사용자 수를 집계하는 쿼리
WITH user_rfm AS
(
-- 위 코드 참고
),
user_rfm_rank AS
(
-- 위 코드 참고
)
SELECT
        r+f+m AS total_rank
        ,COUNT(user_id)
  FROM user_rfm_rank
GROUP BY total_rank
ORDER BY total_rank DESC


-- R과 F를 사영해 2차원 사용자 층의 사용자 수를 집계하는 쿼리 (p.230)
WITH user_rfm AS
(
-- 위 코드 참고
),
user_rfm_rank AS
(
-- 위 코드 참고
)
SELECT 
        CONCAT('r_', r) AS r_rank
        ,COUNT(CASE WHEN f = 5 THEN 1 END) AS f_5
        ,COUNT(CASE WHEN f = 4 THEN 1 END) AS f_4
        ,COUNT(CASE WHEN f = 3 THEN 1 END) AS f_3
        ,COUNT(CASE WHEN f = 2 THEN 1 END) AS f_2
        ,COUNT(CASE WHEN f = 1 THEN 1 END) AS f_1
  FROM user_rfm_rank
GROUP BY r
ORDER BY r_rank DESC

 

 

 

 

 

 

-- 매달 등록 수와 전월비를 계산하는 쿼리
WITH mst_users_with_year_month AS 
(
SELECT 
        *
        ,substring(register_date,1,7) AS year_month
  FROM mst_users
)

SELECT 
        year_month
        ,COUNT(DISTINCT user_id) AS register_count
        ,LAG(COUNT(DISTINCT user_id)) OVER (ORDER BY year_month) AS last_month_count
        ,1.0*COUNT(DISTINCT user_id)/LAG(COUNT(DISTINCT user_id)) OVER(ORDER BY year_month) AS month_over_month_ratio
  FROM mst_users_with_year_month
GROUP BY year_month


-- 디바이스들의 등록 수를 집계하는 쿼리
WITH mst_users_with_year_month AS 
(
-- 위 코드 참고
)
SELECT 
        year_month
        ,COUNT(DISTINCT user_id) AS register_count
        ,COUNT(DISTINCT CASE WHEN register_device = 'pc' THEN user_id END) AS register_pc
        ,COUNT(DISTINCT CASE WHEN register_device = 'sp' THEN user_id END) AS register_sp
        ,COUNT(DISTINCT CASE WHEN register_device = 'app' THEN user_id END) AS register_app
  FROM mst_users_with_year_month
GROUP BY year_month


-- '로그 최근 일자'와 '사용자별 등록일의 다음날'을 계산하는 쿼리
WITH action_log_with_mst_users AS
(
SELECT
        u.user_id
        ,u.register_date
        ,CAST(a.stamp AS date) AS action_date
        ,MAX(CAST(a.stamp AS date)) OVER() AS latest_date
        ,CAST(u.register_date::date + '1 day'::interval AS date) AS next_day_1
  FROM mst_users AS u LEFT OUTER JOIN action_log AS a ON u.user_id = a.user_id
)

 

 

 

WITH action_log_with_mst_users AS
(
SELECT
        u.user_id
        ,u.register_date
        ,CAST(a.stamp AS date) AS action_date
        ,MAX(CAST(a.stamp AS date)) OVER() AS latest_date
        ,CAST(u.register_date::date + '1 day'::interval AS date) AS next_day_1
  FROM mst_users AS u LEFT OUTER JOIN action_log AS a ON u.user_id = a.user_id
)


SELECT *
  FROM action_log_with_mst_users
ORDER BY register_date


-- 사용자의 액션 플래그를 계산하는 쿼리
WITH action_log_with_mst_users AS
(
-- 위 코드 참고
)
, user_action_flag AS
(
SELECT
        user_id
        ,register_date
        ,SIGN(SUM(CASE WHEN next_day_1 <= latest_date THEN 
                    CASE WHEN next_day_1 = action_date THEN 1 ELSE 0 END END)) AS next_1_day_action
  FROM action_log_with_mst_users
GROUP BY user_id, register_date
)

SELECT *
  FROM user_action_flag
ORDER BY register_date, user_id


-- 다음날 지속률을 계산하는 쿼리
WITH action_log_with_mst_users AS
(
-- 위 코드 참고
)
, user_action_flag AS
(
-- 위 코드 참고
)
SELECT 
        register_date
        ,AVG(100.0*next_1_day_action) AS repeat_rate_1_day
  FROM user_action_flag
GROUP BY register_date
ORDER BY register_date


-- 지속률 지표를 관리하는 마스터 테이블을 작성하는 쿼리
WITH repeat_interval(index_name, interval_date) AS
(
VALUES
    ('01 day repeat', 1)
    ,('02 day repeat', 2)
    ,('03 day repeat', 3)
    ,('04 day repeat', 4)
    ,('05 day repeat', 5)
    ,('06 day repeat', 6)
    ,('07 day repeat', 7)
)
SELECT *
  FROM repeat_interval
ORDER BY index_name



-- 지속률을 세로 기반으로 집계하는 쿼리
WITH repeat_interval AS(
-- 위 코드 참고하기
)
,action_log_with_index_date AS
(
SELECT
        u.user_id
        ,u.register_date
        ,CAST(a.stamp AS date) AS action_date
        ,MAX(CAST(a.stamp AS date)) OVER() AS latest_date
        ,r.index_name
        ,CAST(CAST(u.register_date AS date) + interval '1 day' * r.interval_date AS date) AS index_date
  FROM mst_users AS u LEFT OUTER JOIN action_log AS a ON u.user_id = a.user_id
       CROSS JOIN repeat_interval AS r
),
user_action_flag AS
(
SELECT
        user_id
        ,register_date
        ,index_name
        ,SIGN(
            SUM(
                CASE WHEN index_date <= latest_date THEN
                    CASE WHEN index_date = action_date THEN 1 ELSE 0 END END
            )) AS index_date_action
  FROM action_log_with_index_date
GROUP BY user_id, register_date, index_name, index_date
)

SELECT
        register_date
        ,index_name
        ,AVG(100.0*index_date_action) AS repeat_rate
  FROM user_action_flag
GROUP BY register_date, index_name
ORDER BY register_date, index_name



-- 12개월 후까지의 월을 도출하기 위한 보조 테이블을 만드는 쿼리
-- 등록 월에서 12개월 후까지의 잔존율을 집계하는 쿼리
WITH mst_intervals(interval_month) AS
(
VALUES (1), (2), (3), (4), (5), (6)
    ,  (7), (8), (9), (10), (11), (12)
),
mst_users_with_index_month AS
(
SELECT 
        u.user_id
        ,u.register_date
        ,CAST(u.register_date::date + i.interval_month * '1 month' :: interval AS date) AS index_date
        ,substring(u.register_date,1,7) AS register_month
        ,substring(CAST(u.register_date::date + i.interval_month * '1 month'::interval AS text), 1, 7) AS index_month
  FROM mst_users AS u CROSS JOIN mst_intervals AS i 
), 
action_log_in_month AS 
(
SELECT DISTINCT user_id
        ,substring(stamp,1,7) AS action_month
  FROM action_log
)

SELECT
        u.register_month
        ,u.index_month
        ,SUM(CASE WHEN a.action_month IS NOT NULL THEN 1 ELSE 0 END) AS users
        ,AVG(CASE WHEN a.action_month IS NOT NULL THEN 100.0 ELSE 0.0 END) AS retention_rate
  FROM mst_users_with_index_month AS u
    LEFT JOIN 
       action_log_in_month AS a ON u.user_id = a.user_id AND u.index_month = a.action_month
GROUP BY u.register_month, u.index_month
ORDER BY u.register_month, u.index_month




-- 신규 사용자 수, 리피트 사용자 수, 컴백 사용자 수를 집계하는 쿼리
WITH monthly_user_action AS (
-- 월별 사용자 액션 집약하기
SELECT 
        DISTINCT u.user_id
        ,SUBSTRING(u.register_date, 1, 7) AS register_month
        ,SUBSTRING(l.stamp, 1, 7) AS action_month
        ,SUBSTRING(CAST(l.stamp::date - interval '1 month' AS text), 1, 7) AS action_month_priv
  FROM mst_users AS u JOIN action_log AS l ON u.user_id = l.user_id
)
,monthly_user_with_type AS (
-- 월별 사용자 분류 테이블
SELECT 
        action_month
        ,user_id
        ,CASE WHEN register_month = action_month THEN 'new_user' -- 등록 월과 액션월이 일치하면 신규 사용잠
              WHEN action_month_priv = LAG(action_month) -- 이전 월에 액션이 있다면 리피트 사용자
                   OVER(PARTITION BY user_id ORDER BY action_month) THEN 'repeat_user'
              ELSE 'come_back_user' END AS c
        ,action_month_priv
  FROM monthly_user_action
)


SELECT
        action_month
        -- 특정 달의 MAU
        ,COUNT(user_id) AS mau
        ,COUNT(CASE WHEN c = 'new_user'       THEN 1 END) AS new_users
        ,COUNT(CASE WHEN c = 'repeat_user'    THEN 1 END) AS repeat_users
        ,COUNT(CASE WHEN c = 'come_back_user' THEN 1 END) AS come_back_users
  FROM monthly_user_with_type
GROUP BY action_month
ORDER BY action_month

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

'SQL' 카테고리의 다른 글

리트코드 SQL 50  (0) 2024.04.09
LeetCode 문제풀이 (1795)  (0) 2022.07.04
LeetCode 문제풀이 (1393, 1407, 1484, 1581)  (1) 2022.07.01
LeetCode 문제풀이 (626, 1158)  (0) 2022.06.27
LeetCode 문제풀이 (262)  (0) 2022.06.26