PSM学习感悟

整体流程

import psmatching.match as psm

# File path; the file must be in CSV format. Column naming matters: the
# treatment-group column must be named CASE, the per-sample ID column must be
# named OPTUM_LAB_ID (numeric IDs), and covariate names are best kept in English.
path="" 
# Model: treatment column name (always CASE) ~ covariate1 + covariate2 + covariate3.
# The spaces before and after each "+" must not be dropped.
model = "CASE ~ AGE + TOTAL_YRS" #example
# Match k control samples to every treatment sample.
# NOTE(review): k may need to scale with the number of covariates — confirm before use.
k = "3"
# Initialize the matching object.
m = psm.PSMatch(path, model, k)
# Compute a propensity score for every sample in preparation for matching;
# afterwards each row of m.df carries its own propensity score.
m.prepare_data()
# Inspect the samples' propensity scores.
m.df
# Match each treatment sample to control samples by propensity score.
m.match(caliper = None, replace = False)
# After matching, m.matches shows which control IDs were matched to each
# treatment sample, and m.matched_data holds the combined set of treatment
# rows plus their matched control rows.
m.matches
m.matched_data
# Chi-square test on the matched set: checks whether each covariate is
# independent of treatment. NOTE(review): the original author asks why
# p < 0.05 counts as a failure — p < 0.05 rejects independence, i.e. the
# matched groups still differ on that covariate, so the balance check fails.
m.evaluate()

 matched_data匹配得分后的表格

只剩1772行(原表1.5w行),是因为这个表格只保留了干预样本(443行)和为它们匹配到的对照组样本(443×3=1329行),443+1329=1772行

PSM学习感悟_第1张图片

 

 matches的匹配结果


同样是一个表格:行数等于干预样本的个数,列为1列干预样本id加上k列匹配到的对照组id,所以整张表是 干预样本数 × (k+1) 的表格

PSM学习感悟_第2张图片

 包底层拆解

prepare_data

    def prepare_data(self, **kwargs):
        '''
        Load the input CSV and attach a propensity score to every row.

        Reads the file at ``self.path``, indexes it by the subject-ID
        column ``OPTUM_LAB_ID``, computes propensity scores from
        ``self.model`` via get_propensity_scores(), and stores the result
        (raw columns plus a ``PROPENSITY`` column) on ``self.df``.

        Returns
        -------
        None; the prepared Pandas DataFrame is assigned to ``self.df``.
        '''
        # Load the raw data and key it by subject ID.
        raw = pd.read_csv(self.path).set_index("OPTUM_LAB_ID")
        # Score every sample so match() can compare cases to controls.
        print("\nCalculating propensity scores ...", end = " ")
        scores = get_propensity_scores(model = self.model, data = raw, verbose = False)
        print("DONE!")
        print("Preparing data ...", end = " ")
        raw["PROPENSITY"] = scores
        # Expose the scored table on the Match object.
        self.df = raw
        print("DONE!")

match

   def match(self, caliper = None, replace = False, case_column = 'CASE', **kwargs):
        '''
        Performs propensity score matching.

        For every treatment case, selects the ``self.k`` controls whose
        propensity scores are closest to the case's score.

        Parameters
        ----------
        caliper : float or None, optional
            Maximum allowed absolute propensity-score difference for a
            match; candidates farther away are discarded. None (default)
            disables the restriction.
        replace : bool, optional
            If False (default), each control can be matched to at most one
            treatment case (matched controls leave the candidate pool).
        case_column : str, optional
            Name of the treatment-indicator column in ``self.df``; rows
            whose value equals the larger of its two distinct values are
            treated as cases.

        Returns
        -------
        None. Two attributes are set on the Match object:
        matches : Pandas DataFrame
            One row per treatment case ("CASE_ID") with k columns of
            matched control IDs ("CONTROL_MATCH_1" .. "CONTROL_MATCH_k");
            slots that could not be filled contain the string "NA".
        matched_data : Pandas DataFrame
            Raw data restricted to the treatment cases and their matched
            controls (via get_matched_data()).

        Raises
        ------
        AttributeError
            If prepare_data() has not been run first (no 'df' attribute).
        '''
        # Assert that the Match object has a df attribute
        if not hasattr(self, 'df'):
            raise AttributeError("%s does not have a 'df' attribute." % (self))

        # Assign treatment group membership: True for rows whose
        # case_column equals the larger of its two unique values.
        groups = self.df[case_column]
        propensity = self.df.PROPENSITY
        groups = groups == np.sort(groups.unique())[1]
        n = len(groups)
        n1 = groups[groups==1].sum()
        n2 = n-n1
        g1, g2 = propensity[groups==1], propensity[groups==0]

        # Always match from the smaller group into the larger pool.
        if n1 > n2:
            n1, n2, g1, g2 = n2, n1, g2, g1

        # Randomly permute the treatment case IDs so that, when matching
        # without replacement, earlier cases are not systematically favored.
        m_order = list(np.random.permutation(groups[groups==1].index))
        matches = {}
        k = int(self.k)

        # Match treatment cases to controls based on propensity score differences
        print("\nMatching [" + str(k) + "] controls to each case ... ", end = " ")
        for m in m_order:
            # Absolute propensity-score distance from this case to every
            # control still in the pool.
            dist = abs(g1[m]-g2)
            array = np.array(dist)
            # np.partition puts the n_avail smallest distances (unordered)
            # at the front without a full sort.
            # BUG FIX: the original partitioned at index k unconditionally,
            # which raises once the (no-replacement) pool shrinks to <= k
            # controls; clamp to what is actually available.
            n_avail = min(k, len(array))
            if n_avail:
                k_smallest = np.partition(array, n_avail - 1)[:n_avail].tolist()
            else:
                k_smallest = []
            if caliper:
                caliper = float(caliper)
                keep_diffs = [i for i in k_smallest if i <= caliper]
                keep_ids = np.array(dist[dist.isin(keep_diffs)].index)
            else:
                keep_ids = np.array(dist[dist.isin(k_smallest)].index)

            # Break ties via random choice, if ties are present
            if len(keep_ids) > k:
                matches[m] = list(np.random.choice(keep_ids, k, replace=False))
            else:
                # BUG FIX: the original's "len(keep_ids) < k" branch read
                # matches[m] before it was ever assigned (KeyError) and
                # padded while len <= k, i.e. to k+1 entries. Record the
                # ids actually found, then pad with "NA" up to exactly k.
                matches[m] = keep_ids.tolist()
                while len(matches[m]) < k:
                    matches[m].append("NA")

            # Matches are made without replacement: drop matched controls
            # from the pool. "NA" padding values are not real indices and
            # must not be passed to drop() (would raise KeyError).
            if not replace:
                g2 = g2.drop([i for i in matches[m] if i != "NA"])

        # Prettify the results by consolidating into a DataFrame
        matches = pd.DataFrame.from_dict(matches, orient="index")
        matches = matches.reset_index()
        column_names = {}
        column_names["index"] = "CASE_ID"
        for i in range(k):
            column_names[i] = str("CONTROL_MATCH_" + str(i+1))
        matches = matches.rename(columns = column_names)

        # Extract data only for treated cases and matched controls
        matched_data = get_matched_data(matches, self.df)
        print("DONE!")
        # NOTE(review): this persists self.df, not matched_data — looks
        # like it should write the matched subset; confirm against
        # write_matched_data's contract before changing behavior.
        write_matched_data(self.path, self.df)

        # Assign the matches and matched_data attributes to the Match object
        self.matches = matches
        self.matched_data = matched_data

你可能感兴趣的:(统计方法类,学习,python)