统计公司人数

这是一个在sqlservercentral的谜题,要求按月统计每个月加入公司的人数,每个月离开公司的人数和当月公司的总人数。公司的人员加入离开的情况用以下下表来存储:

CREATE TABLE [dbo].[Registrations]

    (

     [Registration_ID] [int] IDENTITY(1, 1)

                             NOT NULL,

     [FirstName] [varchar](80) NOT NULL,

     [LastName] [varchar](80) NOT NULL,

     [DateJoined] [datetime] NOT NULL,

     [DateLeft] [datetime] NULL,

     CONSTRAINT [PK_Registrations] PRIMARY KEY CLUSTERED 

    	([DateJoined], [LastName], [FirstName])

    )

CREATE INDEX idxDateJoined 

    ON Registrations (DateJoined, DateLeft, Registration_ID)

以上表First Name和Last Name表示用户的姓名,DateJoined表示加入公司的时间,DateLeft表示离开公司的时间。具体插入数据的代码可以从此处下载。

先看看按加入时间排序的头十条记录:

Registration_ID FirstName            LastName             DateJoined              DateLeft
--------------- -------------------- -------------------- ----------------------- -----------------------
4               Judith               Palmer               2004-01-01 00:00:00.000 NULL
5               Christa              Jackson              2004-01-01 00:00:00.000 NULL
6               Sophia               Grimes               2004-01-01 00:00:00.000 NULL
7               Tanisha              Delgado              2004-01-01 00:00:00.000 NULL
8               Melisa               Barber               2004-01-01 00:00:00.000 NULL
9               Hannah               Gill                 2004-01-01 00:00:00.000 NULL
10              Dante                Barnes               2004-01-02 00:00:00.000 NULL
11              Rolando              Ford                 2004-01-02 00:00:00.000 NULL
12              Otis                 Anthony              2004-01-02 00:00:00.000 NULL
13              Frankie              Larsen               2004-01-03 00:00:00.000 NULL

可以看出这几个2004年一月加入的人还没有离开公司。最终的结果应该如下所示:

Date                    MonthJoin   MonthLeft   Registrations
----------------------- ----------- ----------- -------------
2004-01-01 00:00:00.000 167         0           167
2004-02-01 00:00:00.000 133         0           300
2004-03-01 00:00:00.000 144         0           444
2004-04-01 00:00:00.000 155         0           599
2004-05-01 00:00:00.000 156         0           755
2004-06-01 00:00:00.000 136         1           890
2004-07-01 00:00:00.000 138         0           1028
2004-08-01 00:00:00.000 132         0           1160
2004-09-01 00:00:00.000 163         0           1323
2004-10-01 00:00:00.000 135         0           1458

。。。。。

首先给出我的一个算法:

DECLARE @sumValue AS INT = 0



CREATE TABLE #Temp(

	Date DATETIME,

	MonthJoin INT,

	MonthLeft INT,

	Registrations INT)

	

;WITH MonthLeft(dt, MJ, ML) AS

(

	SELECT DATEDIFF(MM, 0, r.DateLeft),0, COUNT(*)

	FROM dbo.Registrations r

	WHERE r.DateLeft IS NOT NULL

	GROUP BY DATEDIFF(MM, 0, r.DateLeft)

	

	UNION ALL

	

	SELECT DATEDIFF(MM, 0, r.DateJoined), COUNT(*), 0

	FROM dbo.Registrations r

	GROUP BY DATEDIFF(MM, 0, r.DateJoined)

)



INSERT INTO #Temp(Date, MonthJoin, MonthLeft, Registrations)

SELECT DATEADD(MM, dt, 0), MAX(MJ), MAX(ML), 0

FROM MonthLeft

GROUP BY dt



UPDATE #Temp 

SET @sumValue = Registrations = @sumValue + MonthJoin - MonthLeft

SELECT * FROM #Temp

首先Cursor内部的查询可以查找出每个月公司离开人数,每个月加入公司的人数。每次遍历结果中的记录时,用加入人数减去离开人数得到本月公司的净入人数,然后加上上个月的公司人数就可以得到本月公司的人数。在cursor内部一共会对表进行两次索引查找。

下面给出最佳解决方案(来自所给链接):

SET STATISTICS TIME ON

--DROP TABLE #Stage



CREATE TABLE    #Stage

        (

                theMonth SMALLINT NOT NULL,

                PeopleJoined INT NOT NULL,

                PeopleLeft INT NOT NULL,

                Subscribers INT NOT NULL

        )



INSERT          #Stage

        (

                theMonth,

                PeopleJoined,

                PeopleLeft,

                Subscribers

        )

SELECT          u.theMonth,

        SUM(CASE WHEN u.theCol = 'DateJoined' THEN u.Registrations ELSE 0 END) AS PeopleJoined,

        SUM(CASE WHEN u.theCol = 'DateLeft' THEN u.Registrations ELSE 0 END) AS PeopleLeft,

        0 AS Subscribers

FROM            (

                SELECT          DATEDIFF(MONTH, 0, DateJoined) AS DateJoined,

                                DATEDIFF(MONTH, 0, DateLeft) AS DateLeft,

                                COUNT(*) AS Registrations

                FROM            dbo.Registrations

                GROUP BY        DATEDIFF(MONTH, 0, DateJoined),

                                DATEDIFF(MONTH, 0, DateLeft)

        ) AS d

UNPIVOT         (

                theMonth

                FOR theCol IN (d.DateJoined, d.DateLeft)

        ) AS u

GROUP BY    u.theMonth

HAVING          SUM(CASE WHEN u.theCol = 'DateJoined' THEN u.Registrations ELSE 0 END) > 0



DECLARE @Subscribers INT = 0



;WITH Yak (theMonth, PeopleJoined, PeopleLeft, Subscribers)

AS (

    SELECT              TOP 2147483647

                DATEADD(MONTH, theMonth, 0) AS theMonth,

                PeopleJoined,

                PeopleLeft,

                Subscribers

    FROM                #Stage

    ORDER BY    theMonth

)



UPDATE  Yak

SET @Subscribers = Subscribers = @Subscribers + PeopleJoined - PeopleLeft

OUTPUT  inserted.theMonth,

    inserted.PeopleJoined,

    inserted.PeopleLeft,

    inserted.Subscribers

这个解决方案非常快,只做了一次表的聚集索引扫描,主要使用了UNPIVOT关键字。这个解决方案有太多值得学习的东西。

你可能感兴趣的:(统计)