I would like to create a linear ancestry listing for a tree breeding project. The parents are male/female pairs that must not be related (no inbreeding), hence the importance to track and visualize these pedigrees...
Below is the test tables/data using Postgresql 9.1:
DROP TABLE if exists family CASCADE;
DROP TABLE if exists plant CASCADE;
CREATE TABLE family (
id serial,
family_key VARCHAR(20) UNIQUE,
female_plant_id INTEGER NOT NULL DEFAULT 1,
male_plant_id INTEGER NOT NULL DEFAULT 1,
filial_n INTEGER NOT NULL DEFAULT -1, -- eg 0,1,2... Which would represent None, F1, F2...
CONSTRAINT family_pk PRIMARY KEY (id)
);
CREATE TABLE plant (
id serial,
plant_key VARCHAR(20) UNIQUE,
id_family INTEGER NOT NULL,
CONSTRAINT plant_pk PRIMARY KEY (id),
CONSTRAINT plant_id_family_fk FOREIGN KEY(id_family) REFERENCES family(id) -- temp may need to remove constraint...
);
-- FAMILY Table DATA:
insert into family (id, family_key, female_plant_id, male_plant_id, filial_n) VALUES (1,'NA',1,1,1); -- Default place holder record
-- Root level Alba families
insert into family (id, family_key, female_plant_id, male_plant_id, filial_n) VALUES (2,'family1AA',2,3,1);
insert into family (id, family_key, female_plant_id, male_plant_id, filial_n) VALUES (3,'family2AA',4,5,1);
insert into family (id, family_key, female_plant_id, male_plant_id, filial_n) VALUES (4,'family3AA',6,7,1);
-- F2 Hybrid Families
insert into family (id, family_key, female_plant_id, male_plant_id, filial_n) VALUES (5,'family4AE',8,11,0);
insert into family (id, family_key, female_plant_id, male_plant_id, filial_n) VALUES (6,'family5AG',9,12,0);
insert into family (id, family_key, female_plant_id, male_plant_id, filial_n) VALUES (7,'family6AT',10,13,0);
-- F3 Double Hybrid family:
insert into family (id, family_key, female_plant_id, male_plant_id, filial_n) VALUES (9,'family7AEAG',14,15,0);
-- F3 Tri-hybrid backcross family:
insert into family (id, family_key, female_plant_id, male_plant_id, filial_n) VALUES (10,'family8AEAGAT',17,16,0);
-- PLANT Table DATA:
-- Root level Alba Parents:
insert into plant (id, plant_key, id_family) VALUES (1,'NA',1); -- Default place holder record
insert into plant (id, plant_key, id_family) VALUES (2,'female1A',1);
insert into plant (id, plant_key, id_family) VALUES (3,'male1A',1);
insert into plant (id, plant_key, id_family) VALUES (4,'female2A',1);
insert into plant (id, plant_key, id_family) VALUES (5,'male2A',1);
insert into plant (id, plant_key, id_family) VALUES (6,'female3A',1);
insert into plant (id, plant_key, id_family) VALUES (7,'male3A',1);
-- Female Alba progeny:
insert into plant (id, plant_key, id_family) VALUES (8,'female4A',2);
insert into plant (id, plant_key, id_family) VALUES (9,'female5A',3);
insert into plant (id, plant_key, id_family) VALUES (10,'female6A',4);
-- Male Aspen Root level parents:
insert into plant (id, plant_key, id_family) VALUES (11,'male1E',1);
insert into plant (id, plant_key, id_family) VALUES (12,'male1G',1);
insert into plant (id, plant_key, id_family) VALUES (13,'female1T',1);
-- F1 Hybrid progeny:
insert into plant (id, plant_key, id_family) VALUES (14,'female1AE',5);
insert into plant (id, plant_key, id_family) VALUES (15,'male1AG',6);
insert into plant (id, plant_key, id_family) VALUES (16,'male1AT',7);
-- Hybrid progeny
insert into plant (id, plant_key, id_family) VALUES (17,'female1AEAG',9);
-- Tri-hybrid backcross progeny:
insert into plant (id, plant_key, id_family) VALUES (18,'female1AEAGAT',10);
insert into plant (id, plant_key, id_family) VALUES (19,'female2AEAGAT',10);
Below is the Recursive query that I derived from the Postgres WITH Queries documentation:
WITH RECURSIVE search_tree(
family_key
, female_plant
, male_plant
, depth
, path
, cycle
) AS (
SELECT
f.family_key
, pf.plant_key
, pm.plant_key
, 1
, ARRAY[ROW(pf.plant_key, pm.plant_key)]
, false
FROM
family f
, plant pf
, plant pm
WHERE
f.female_plant_id = pf.id
AND f.male_plant_id = pm.id
AND f.filial_n = 1 -- Include only F1 families (root level)
AND f.id <> 1 -- omit the default first family record
UNION ALL
SELECT
f.family_key
, pf.plant_key
, pm.plant_key
, st.depth + 1
, path || ROW(pf.plant_key, pm.plant_key)
, ROW(pf.plant_key, pm.plant_key) = ANY(path)
FROM
family f
, plant pf
, plant pm
, search_tree st
WHERE
f.female_plant_id = pf.id
AND f.male_plant_id = pm.id
AND f.family_key = st.family_key
AND pf.plant_key = st.female_plant
AND pm.plant_key = st.male_plant
AND f.filial_n <> 1 -- Include only non-F1 families (non-root levels)
AND NOT cycle
)
SELECT * FROM search_tree;
Below is the desired output:
F1 family1AA=(female1A x male1A) > F2 family4AE=(female4A x male1E) > F3 family7AEAG=(female1AE x male1AG) > F4 family8AEAGAT=(female1AEAG x male1AT)
F1 family2AA=(female2A x male2A) > F2 family5AG=(female5A x male1G) > F3 family7AEAG=(female1AE x male1AG) > F4 family8AEAGAT=(female1AEAG x male1AT)
F1 family3AA=(female3A x male3A) > F2 family6AT=(female6A x female1T) > F3 family8AEAGAT=(female1AEAG x male1AT)
The above Recursive query displays 3 rows with the appropriate F1 parents but the path does not display the downstream families/parents. I would appreciate help to make the recursive output similar to the desired output listed above.
I have adapted the query to what I have understood, not necessarily to what is required :-)
The query starts at the three given families defined by f.id != 1 AND f.filial_n = 1
and recursively expands available children.
On what condition only the last three matches should be selected is beyond my understanding. Perhaps for each starting family the longest chain of anchestors?
WITH RECURSIVE expanded_family AS (
SELECT
f.id,
f.family_key,
pf.id pd_id,
pf.plant_key pf_key,
pf.id_family pf_family,
pm.id pm_id,
pm.plant_key pm_key,
pm.id_family pm_family,
f.filial_n
FROM family f
JOIN plant pf ON f.female_plant_id = pf.id
JOIN plant pm ON f.male_plant_id = pm.id
),
search_tree AS (
SELECT
f.*,
1 depth,
ARRAY[f.family_key::text] path
FROM expanded_family f
WHERE
f.id != 1
AND f.filial_n = 1
UNION ALL
SELECT
f.*,
depth + 1,
path || f.family_key::text
FROM search_tree st
JOIN expanded_family f
ON f.pf_family = st.id
OR f.pm_family = st.id
WHERE
f.id <> 1
)
SELECT
family_key,
depth,
path
FROM search_tree;
The result is:
family_key | depth | path
---------------+-------+-------------------------------------------------
family1AA | 1 | {family1AA}
family2AA | 1 | {family2AA}
family3AA | 1 | {family3AA}
family4AE | 2 | {family1AA,family4AE}
family5AG | 2 | {family2AA,family5AG}
family6AT | 2 | {family3AA,family6AT}
family7AEAG | 3 | {family1AA,family4AE,family7AEAG}
family7AEAG | 3 | {family2AA,family5AG,family7AEAG}
family8AEAGAT | 3 | {family3AA,family6AT,family8AEAGAT}
family8AEAGAT | 4 | {family1AA,family4AE,family7AEAG,family8AEAGAT}
family8AEAGAT | 4 | {family2AA,family5AG,family7AEAG,family8AEAGAT}
Technical stuff:
I have removed the cycle
stuff because for clean data it should not be necessary (IMHO).
expanded_family
can be inlined if some odd performance problem occurs, but for now it makes the recursive query more readable.
EDIT
A slight modification of the query can filter these rows where, for each "root" family (i.e. the ones for which the query started), the longest path exists.
I show only the changed part in search_tree
, so you have to copy the head from the previous section:
-- ...
search_tree AS
(
SELECT
f.*,
f.id family_root, -- remember where the row came from.
1 depth,
ARRAY[f.family_key::text] path
FROM expanded_family f
WHERE
f.id != 1
AND f.filial_n = 1
UNION ALL
SELECT
f.*,
st.family_root, -- propagate the anchestor
depth + 1,
path || f.family_key::text
FROM search_tree st
JOIN expanded_family f
ON f.pf_family = st.id
OR f.pm_family = st.id
WHERE
f.id <> 1
)
SELECT
family_key,
path
FROM
(
SELECT
rank() over (partition by family_root order by depth desc),
family_root,
family_key,
depth,
path
FROM search_tree
) AS ranked
WHERE rank = 1;
The result is:
family_key | path
---------------+-------------------------------------------------
family8AEAGAT | {family1AA,family4AE,family7AEAG,family8AEAGAT}
family8AEAGAT | {family2AA,family5AG,family7AEAG,family8AEAGAT}
family8AEAGAT | {family3AA,family6AT,family8AEAGAT}
(3 rows)
EDIT2
Based on the comments I added a pretty_print
version of the path:
WITH RECURSIVE expanded_family AS (
SELECT
f.id,
pf.id_family pf_family,
pm.id_family pm_family,
f.filial_n,
f.family_key || '=(' || pf.plant_key || ' x ' || pm.plant_key || ')' pretty_print
FROM family f
JOIN plant pf ON f.female_plant_id = pf.id
JOIN plant pm ON f.male_plant_id = pm.id
),
search_tree AS
(
SELECT
f.id,
f.id family_root,
1 depth,
'F1 ' || f.pretty_print path
FROM expanded_family f
WHERE
f.id != 1
AND f.filial_n = 1
UNION ALL
SELECT
f.id,
st.family_root,
st.depth + 1,
st.path || ' -> F' || st.depth+1 || ' ' || f.pretty_print
FROM search_tree st
JOIN expanded_family f
ON f.pf_family = st.id
OR f.pm_family = st.id
WHERE
f.id <> 1
)
SELECT
path
FROM
(
SELECT
rank() over (partition by family_root order by depth desc),
path
FROM search_tree
) AS ranked
WHERE rank = 1;
The result is
path
----------------------------------------------------------------------------------------------------------------------------------------------------------
F1 family1AA=(female1A x male1A) -> F2 family4AE=(female4A x male1E) -> F3 family7AEAG=(female1AE x male1AG) -> F4 family8AEAGAT=(female1AEAG x male1AT)
F1 family2AA=(female2A x male2A) -> F2 family5AG=(female5A x male1G) -> F3 family7AEAG=(female1AE x male1AG) -> F4 family8AEAGAT=(female1AEAG x male1AT)
F1 family3AA=(female3A x male3A) -> F2 family6AT=(female6A x female1T) -> F3 family8AEAGAT=(female1AEAG x male1AT)
(3 rows)
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With