I have a table with let's say 100 values that I want to classify in four groups based on minimizing the dispersion inside each group.
I have done this succesfully in excel with Solver minimizing the sum of the variance in each group, but I want to do it in SQL and I'm stuck in how to proceed.
The output I want is for each row that has an ID and a Value, get a third column with the proposed classificacion (1,2,3 or 4).
More graphically is basically determining the orange lines:

EDIT: Sample set and desired otuput:
http://sqlfiddle.com/#!9/d3736/1
Okay, I love this question, although I agree that using SQL to solve this is never going to be efficient.
My solution takes a brute-force approach. So basically I do this:
That is a horrible solution, as I am literally throwing around millions of data items to get the answer. It takes around 2-3 minutes to execute for your sample data. I imagine it wouldn't scale well at all, so if your real data is much larger than this then it wouldn't be appropriate at all.
However, I bet this could be optimised to some degree?
Also, I get a different answer to you. Your total variance for your proposed class assignment is 19.67926, but my best answer has a total variance of 18.95997.
Your answer is:
Class Lower_Bound Upper_Bound
1 3.638 5.223
2 5.321 6.683
3 6.951 8.241
4 8.561 10.100
My answer is:
Class Lower_Bound Upper_Bound
1 3.638 4.952
2 5.223 6.547
3 6.604 8.241
4 8.561 10.100
So similar, but some slight movement there.
Now for the horrid script, apologies for this, it isn't optimised at all, and I left all my working outs in...
--Grab the source data
DECLARE @Data TABLE (ID INT, Value NUMERIC(19,6), Class INT);
INSERT INTO @Data
VALUES
(1000, 4.701, 1),
(1001, 5.223, 1),
(1002, 4.335, 1),
(1003, 9.234, 4),
(1004, 8.684, 4),
(1005, 6.507, 2),
(1006, 9.458, 4),
(1007, 9.663, 4),
(1011, 4.259, 1),
(1012, 8.241, 3),
(1013, 5.531, 2),
(1014, 4.434, 1),
(1015, 4.428, 1),
(1016, 8.119, 3),
(1017, 5.696, 2),
(1018, 8.142, 3),
(1019, 4.349, 1),
(1020, 4.315, 1),
(1021, 9.130, 4),
(1023, 9.278, 4),
(1024, 4.251, 1),
(1027, 7.414, 3),
(1028, 9.502, 4),
(1032, 8.561, 4),
(1033, 9.020, 4),
(1034, 5.365, 2),
(1037, 9.343, 4),
(2000, 9.330, 4),
(2001, 7.838, 3),
(2002, 9.806, 4),
(2003, 7.405, 3),
(2004, 9.970, 4),
(2008, 9.702, 4),
(2009, 10.100, 4),
(2010, 7.679, 3),
(2011, 7.180, 3),
(2012, 8.936, 4),
(3000, 7.249, 3),
(3001, 6.547, 2),
(3002, 5.608, 2),
(3003, 5.613, 2),
(3004, 4.473, 1),
(3005, 5.430, 2),
(3007, 5.766, 2),
(3009, 4.466, 1),
(3011, 4.532, 1),
(3012, 4.878, 1),
(3013, 6.388, 2),
(3014, 4.413, 1),
(3015, 4.689, 1),
(3016, 6.683, 2),
(3017, 5.708, 2),
(3018, 5.468, 2),
(3020, 9.797, 4),
(3022, 6.018, 2),
(3027, 4.493, 1),
(3031, 4.381, 1),
(4001, 4.720, 1),
(4002, 4.482, 1),
(4003, 5.631, 2),
(4004, 8.859, 4),
(4005, 4.788, 1),
(4006, 8.573, 4),
(4007, 5.553, 2),
(4008, 6.604, 2),
(4009, 4.394, 1),
(4010, 6.313, 2),
(5000, 4.269, 1),
(5001, 4.162, 1),
(5002, 4.614, 1),
(5003, 4.142, 1),
(5004, 3.975, 1),
(5005, 4.076, 1),
(5007, 4.299, 1),
(5008, 4.219, 1),
(5009, 4.229, 1),
(5010, 4.109, 1),
(5011, 4.086, 1),
(5012, 4.617, 1),
(5013, 5.470, 2),
(5014, 4.366, 1),
(5015, 4.655, 1),
(5017, 4.083, 1),
(5018, 4.261, 1),
(5019, 4.104, 1),
(5020, 4.297, 1),
(5021, 4.426, 1),
(5022, 6.189, 2),
(5023, 4.327, 1),
(5024, 4.380, 1),
(6000, 4.216, 1),
(6001, 7.150, 3),
(6002, 7.321, 3),
(6003, 4.198, 1),
(6004, 4.111, 1),
(6005, 5.321, 2),
(6006, 3.891, 1),
(6007, 7.370, 3),
(6008, 7.417, 3),
(6009, 7.095, 3),
(6010, 7.115, 3),
(6011, 6.005, 2),
(6012, 4.152, 1),
(6013, 5.683, 2),
(6014, 4.952, 1),
(6015, 3.881, 1),
(6016, 5.412, 2),
(6017, 5.405, 2),
(6018, 7.163, 3),
(6019, 4.451, 1),
(6020, 4.150, 1),
(6021, 4.424, 1),
(6022, 7.156, 3),
(6024, 6.242, 2),
(6025, 4.488, 1),
(6026, 5.732, 2),
(6027, 4.390, 1),
(6028, 5.580, 2),
(6029, 6.265, 2),
(6032, 5.493, 2),
(6033, 4.281, 1),
(6034, 4.387, 1),
(7000, 4.300, 1),
(7001, 4.349, 1),
(7002, 4.241, 1),
(7003, 4.213, 1),
(7004, 4.363, 1),
(7005, 4.217, 1),
(7006, 4.213, 1),
(7008, 4.484, 1),
(7009, 4.086, 1),
(7010, 4.072, 1),
(7011, 4.067, 1),
(7012, 4.098, 1),
(7013, 5.838, 2),
(7015, 4.028, 1),
(7016, 3.880, 1),
(7021, 3.797, 1),
(7022, 3.990, 1),
(7023, 4.263, 1),
(7024, 3.968, 1),
(7026, 3.926, 1),
(7030, 4.326, 1),
(7031, 4.158, 1),
(7032, 4.387, 1),
(7033, 4.836, 1),
(7034, 4.282, 1),
(7035, 4.418, 1),
(7036, 4.352, 1),
(7037, 4.267, 1),
(7038, 4.394, 1),
(7039, 4.195, 1),
(7040, 4.367, 1),
(7042, 4.339, 1),
(7043, 4.024, 1),
(7044, 4.398, 1),
(7045, 4.339, 1),
(7046, 4.283, 1),
(7047, 4.422, 1),
(8000, 4.175, 1),
(8001, 4.178, 1),
(8002, 4.256, 1),
(8003, 6.951, 3),
(8004, 4.329, 1),
(8007, 7.603, 3),
(8008, 6.457, 2),
(8011, 7.551, 3),
(8012, 4.361, 1),
(8014, 7.009, 3),
(8015, 4.293, 1),
(8016, 4.131, 1),
(8017, 4.000, 1),
(8019, 3.915, 1),
(8022, 3.731, 1),
(8023, 4.192, 1),
(8024, 4.221, 1),
(8025, 4.212, 1),
(8028, 4.056, 1),
(9001, 4.429, 1),
(9002, 4.432, 1),
(9003, 4.445, 1),
(9004, 3.696, 1),
(9005, 4.269, 1),
(9010, 4.434, 1),
(9011, 3.677, 1),
(9016, 4.440, 1),
(9017, 3.638, 1),
(9018, 4.426, 1);
--Prove we can calculate the total variance from this
WITH Metrics AS (
SELECT
Class,
SUM(Value) AS Class_Total,
COUNT(*) AS Class_Items,
SUM(Value) / COUNT(*) AS Mean_Value
FROM
@Data
GROUP BY
Class),
Variance AS (
SELECT
d.Class,
m.Class_Items,
MIN(d.Value) AS Lower_Bound,
MAX(d.Value) AS Upper_Bound,
SUM(POWER(d.Value - m.Mean_Value, 2)) AS Variance
FROM
@Data d
INNER JOIN Metrics m ON m.Class = d.Class
GROUP BY
d.Class,
m.Class_Items)
SELECT * FROM Variance;
--Brute force the classes
DROP TABLE #class;
DROP TABLE #data;
CREATE TABLE #class (
iteration INT,
threshold_1 INT,
threshold_2 INT,
threshold_3 INT);
--Organise the population into order
CREATE TABLE #data (
data_order INT,
data_value NUMERIC(19,6));
INSERT INTO
#data
SELECT
ROW_NUMBER() OVER (ORDER BY Value),
value
FROM
@Data
DECLARE @max_order INT;
SELECT @max_order = MAX(data_order) FROM #data;
SELECT @max_order;
--Set up the initial iteration
INSERT INTO
#class
SELECT
1,
2,
3,
4;
--Now use recursion to set up every other iteration to test
DROP TABLE #iterations;
WITH recursion AS (
SELECT
iteration,
threshold_1,
threshold_2,
threshold_3
FROM
#class
UNION ALL
SELECT
iteration + 1,
CASE
WHEN threshold_3 < @max_order - 1 OR threshold_2 < @max_order - 2 THEN threshold_1
ELSE threshold_1 + 1
END,
CASE
WHEN threshold_3 < @max_order - 1 THEN threshold_2
WHEN threshold_2 < @max_order - 2 THEN threshold_2 + 1
ELSE threshold_1 + 2
END,
CASE
WHEN threshold_3 < @max_order - 1 THEN threshold_3 + 1
WHEN threshold_2 < @max_order - 2 THEN threshold_2 + 2
ELSE threshold_1 + 3
END
FROM
recursion
WHERE
threshold_1 < @max_order - 3)
SELECT
*,
CONVERT(NUMERIC(19,6), NULL) AS total_variance
INTO
#iterations
FROM
recursion
OPTION (MAXRECURSION 0);
--Now work over the set of iterations, calculating the total variance for each
DECLARE @iteration INT;
SELECT @iteration = ISNULL(MIN(iteration), 1) FROM #iterations WHERE total_variance IS NULL;
WHILE @iteration IS NOT NULL
BEGIN
WITH ClassAssignment AS (
SELECT
d.*,
CASE
WHEN data_order < threshold_1 THEN 1
WHEN data_order < threshold_2 THEN 2
WHEN data_order < threshold_3 THEN 3
ELSE 4
END AS class
FROM
#data d
CROSS JOIN #iterations i
WHERE
i.iteration = @iteration),
Metrics AS (
SELECT
class,
--SUM(data_value) AS class_total,
--COUNT(*) AS class_items,
SUM(data_value) / COUNT(*) AS mean_value
FROM
ClassAssignment
GROUP BY
class),
Variance AS (
SELECT
d.class,
SUM(POWER(d.data_value - m.mean_value, 2)) AS variance
FROM
ClassAssignment d
INNER JOIN Metrics m ON m.class = d.class
GROUP BY
d.class),
TotalVariance AS (
SELECT SUM(Variance) AS total_variance FROM Variance)
UPDATE
i
SET
total_variance = v.total_variance
FROM
#iterations i
CROSS JOIN TotalVariance v
WHERE
iteration = @iteration;
SELECT @iteration = @iteration + 1;
PRINT 'Iteration #' + CONVERT(VARCHAR(50), @iteration);
END;
--Try to do this in a set-based way
WITH ClassAssignment AS (
SELECT
i.iteration,
d.*,
CASE
WHEN data_order < threshold_1 THEN 1
WHEN data_order < threshold_2 THEN 2
WHEN data_order < threshold_3 THEN 3
ELSE 4
END AS class
FROM
#data d
CROSS JOIN #iterations i),
Metrics AS (
SELECT
iteration,
class,
SUM(data_value) / COUNT(*) AS mean_value
FROM
ClassAssignment
GROUP BY
class,
iteration),
Variance AS (
SELECT
d.iteration,
d.class,
SUM(POWER(d.data_value - m.mean_value, 2)) AS variance
FROM
ClassAssignment d
INNER JOIN Metrics m ON m.iteration = d.iteration AND m.class = d.class
GROUP BY
d.iteration,
d.class),
TotalVariance AS (
SELECT
iteration,
SUM(Variance) AS total_variance
FROM
Variance
GROUP BY
iteration)
UPDATE
i
SET
total_variance = v.total_variance
FROM
#iterations i
INNER JOIN TotalVariance v ON v.iteration = i.iteration;
--Results
SELECT TOP 1 * FROM #iterations WHERE total_variance IS NOT NULL ORDER BY total_variance;
--Best result is 116, 147, 170, with a total variance of 18.959971
--What is this in terms of numbers?
--Class #1
SELECT * FROM #data WHERE data_order = 1;
SELECT * FROM #data WHERE data_order = 115;
--Class #2
SELECT * FROM #data WHERE data_order = 116;
SELECT * FROM #data WHERE data_order = 146;
--Class #3
SELECT * FROM #data WHERE data_order = 147;
SELECT * FROM #data WHERE data_order = 169;
--Class #4
SELECT * FROM #data WHERE data_order = 170;
SELECT * FROM #data WHERE data_order = 188;
Brute-force solution in one Update:
--Sample data
DECLARE @Data TABLE (ID int, Value float, Classification int);
INSERT INTO @Data (ID, Value)
VALUES
(1000, 4.701), (1001, 5.223), (1002, 4.335), (1003, 9.234), (1004, 8.684), (1005, 6.507), (1006, 9.458), (1007, 9.663),
(1011, 4.259), (1012, 8.241), (1013, 5.531), (1014, 4.434), (1015, 4.428), (1016, 8.119), (1017, 5.696), (1018, 8.142),
(1019, 4.349), (1020, 4.315), (1021, 9.130), (1023, 9.278), (1024, 4.251), (1027, 7.414), (1028, 9.502), (1032, 8.561),
(1033, 9.020), (1034, 5.365), (1037, 9.343), (2000, 9.330), (2001, 7.838), (2002, 9.806), (2003, 7.405), (2004, 9.970),
(2008, 9.702), (2009, 10.10), (2010, 7.679), (2011, 7.180), (2012, 8.936), (3000, 7.249), (3001, 6.547), (3002, 5.608),
(3003, 5.613), (3004, 4.473), (3005, 5.430), (3007, 5.766), (3009, 4.466), (3011, 4.532), (3012, 4.878), (3013, 6.388),
(3014, 4.413), (3015, 4.689), (3016, 6.683), (3017, 5.708), (3018, 5.468), (3020, 9.797), (3022, 6.018), (3027, 4.493),
(3031, 4.381), (4001, 4.720), (4002, 4.482), (4003, 5.631), (4004, 8.859), (4005, 4.788), (4006, 8.573), (4007, 5.553),
(4008, 6.604), (4009, 4.394), (4010, 6.313), (5000, 4.269), (5001, 4.162), (5002, 4.614), (5003, 4.142), (5004, 3.975),
(5005, 4.076), (5007, 4.299), (5008, 4.219), (5009, 4.229), (5010, 4.109), (5011, 4.086), (5012, 4.617), (5013, 5.470),
(5014, 4.366), (5015, 4.655), (5017, 4.083), (5018, 4.261), (5019, 4.104), (5020, 4.297), (5021, 4.426), (5022, 6.189),
(5023, 4.327), (5024, 4.380), (6000, 4.216), (6001, 7.150), (6002, 7.321), (6003, 4.198), (6004, 4.111), (6005, 5.321),
(6006, 3.891), (6007, 7.370), (6008, 7.417), (6009, 7.095), (6010, 7.115), (6011, 6.005), (6012, 4.152), (6013, 5.683),
(6014, 4.952), (6015, 3.881), (6016, 5.412), (6017, 5.405), (6018, 7.163), (6019, 4.451), (6020, 4.150), (6021, 4.424),
(6022, 7.156), (6024, 6.242), (6025, 4.488), (6026, 5.732), (6027, 4.390), (6028, 5.580), (6029, 6.265), (6032, 5.493),
(6033, 4.281), (6034, 4.387), (7000, 4.300), (7001, 4.349), (7002, 4.241), (7003, 4.213), (7004, 4.363), (7005, 4.217),
(7006, 4.213), (7008, 4.484), (7009, 4.086), (7010, 4.072), (7011, 4.067), (7012, 4.098), (7013, 5.838), (7015, 4.028),
(7016, 3.880), (7021, 3.797), (7022, 3.990), (7023, 4.263), (7024, 3.968), (7026, 3.926), (7030, 4.326), (7031, 4.158),
(7032, 4.387), (7033, 4.836), (7034, 4.282), (7035, 4.418), (7036, 4.352), (7037, 4.267), (7038, 4.394), (7039, 4.195),
(7040, 4.367), (7042, 4.339), (7043, 4.024), (7044, 4.398), (7045, 4.339), (7046, 4.283), (7047, 4.422), (8000, 4.175),
(8001, 4.178), (8002, 4.256), (8003, 6.951), (8004, 4.329), (8007, 7.603), (8008, 6.457), (8011, 7.551), (8012, 4.361),
(8014, 7.009), (8015, 4.293), (8016, 4.131), (8017, 4.000), (8019, 3.915), (8022, 3.731), (8023, 4.192), (8024, 4.221),
(8025, 4.212), (8028, 4.056), (9001, 4.429), (9002, 4.432), (9003, 4.445), (9004, 3.696), (9005, 4.269), (9010, 4.434),
(9011, 3.677), (9016, 4.440), (9017, 3.638), (9018, 4.426)
;
-- Average distance classification
With Borders As ( --< All possible combinations of borders
Select B0, B1.Value As B1, B2.Value As B2, B3.Value As B3, B4
From (Select MIN(Value)-1 As B0, MAX(Value) As B4 From @Data) B0
Inner Join @Data B1 On B1.Value > B0.B0
Inner Join @Data B2 On B2.Value > B1.Value
Inner Join @Data B3 On B3.Value > B2.Value AND B3.Value < B0.B4
), Averages As ( --< All combinations with group average value calculated
Select B0, B1, B2, B3, B4, AVG(B1.Value) As A1, AVG(B2.Value) As A2, AVG(B3.Value) As A3, AVG(B4.Value) As A4
From Borders
Inner Join @Data B1 On B1.Value > Borders.B0 AND B1.Value <= Borders.B1
Inner Join @Data B2 On B2.Value > Borders.B1 AND B2.Value <= Borders.B2
Inner Join @Data B3 On B3.Value > Borders.B2 AND B3.Value <= Borders.B3
Inner Join @Data B4 On B4.Value > Borders.B3 AND B4.Value <= Borders.B4
Group By B0, B1, B2, B3, B4
), Groups As ( --< All group with average deviations
Select B0, B1, B2, B3, B4, Sum(Abs(A1 - B1.Value)) + Sum(Abs(A2 - B2.Value)) + Sum(Abs(A3 - B3.Value)) + Sum(Abs(A4 - B4.Value)) As TotalDistance
From Averages
Inner Join @Data B1 On B1.Value > Averages.B0 AND B1.Value <= Averages.B1
Inner Join @Data B2 On B2.Value > Averages.B1 AND B2.Value <= Averages.B2
Inner Join @Data B3 On B3.Value > Averages.B2 AND B3.Value <= Averages.B3
Inner Join @Data B4 On B4.Value > Averages.B3 AND B4.Value <= Averages.B4
Group By B0, B1, B2, B3, B4
), Minimal As ( --< Group with minimal TotalDistance
Select Top 1 * From Groups Order By TotalDistance ASC
)
Update Tgt
Set Tgt.Classification = CASE
WHEN Value > B0 AND Value <= B1 THEN 1
WHEN Value > B1 AND Value <= B2 THEN 2
WHEN Value > B2 AND Value <= B3 THEN 3
WHEN Value > B3 AND Value <= B4 THEN 4
END
From @Data Tgt, Minimal
;
-- Full results
Select * From @Data;
-- Each group average for comparison
Select Tgt.Classification, Avg(Abs(Tgt.Value - Ref.Value)) As AverageDistance
From @Data Tgt
Inner Join (Select Classification, Avg(Value) As Value From @Data Group By Classification) Ref On Ref.Classification = Tgt.Classification
Group By Tgt.Classification;
-- Standard Deviation classification
With Borders As ( --< All possible combinations of borders
Select B0, B1.Value As B1, B2.Value As B2, B3.Value As B3, B4
From (Select MIN(Value)-1 As B0, MAX(Value) As B4 From @Data) B0
Inner Join @Data B1 On B1.Value > B0.B0
Inner Join @Data B2 On B2.Value > B1.Value
Inner Join @Data B3 On B3.Value > B2.Value AND B3.Value < B0.B4
), Minimal As ( --< Group with minimal Total Standard Deviation
Select Top 1 B0, B1, B2, B3, B4, STDEV(B1.Value) + STDEV(B2.Value) + STDEV(B3.Value) + STDEV(B4.Value) As TotalSTDEV
From Borders
Inner Join @Data B1 On B1.Value > Borders.B0 AND B1.Value <= Borders.B1
Inner Join @Data B2 On B2.Value > Borders.B1 AND B2.Value <= Borders.B2
Inner Join @Data B3 On B3.Value > Borders.B2 AND B3.Value <= Borders.B3
Inner Join @Data B4 On B4.Value > Borders.B3 AND B4.Value <= Borders.B4
Group By B0, B1, B2, B3, B4
Order By TotalSTDEV ASC
)
Update Tgt
Set Tgt.Classification = CASE
WHEN Value > B0 AND Value <= B1 THEN 1
WHEN Value > B1 AND Value <= B2 THEN 2
WHEN Value > B2 AND Value <= B3 THEN 3
WHEN Value > B3 AND Value <= B4 THEN 4
END
From @Data Tgt, Minimal
;
-- Full results
Select * From @Data;
-- Each group Standard Deviation for comparison
Select Classification, STDEV(Value) As StandardDeviation
From @Data
Group By Classification;
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With