|
** PURPOSE: To match information about resident children in A_NATCHILD with their information in A_INDALL
|
|
** CREATED BY: ALITA NANDI
|
|
** CREATED ON: 30-11-2017
|
|
** MODIFIED ON: 08-01-2018
|
|
|
|
// ---------------------------------------------------------------------------
// Setup: start from an empty session and point the global m at the folder
// that holds the UKHLS data files used below (a_natchild, a_indall, a_egoalt).
// ---------------------------------------------------------------------------
clear all

// EDIT THIS LINE: replace the placeholder text with the real folder path.
global m "Folder Path where UKHLS datafiles are stored"

// Fail straight away, with an explicit message, when the folder has not been
// set correctly - otherwise the first -use- further down stops with a bare
// "file not found" error that does not point back to this line.
capture confirm file "${m}/a_natchild.dta"
if _rc {
    display as error "a_natchild.dta not found in folder: ${m}"
    display as error "Edit the -global m- line at the top of this do-file."
    exit 601
}
|
|
|
|
|
|
** Q1. Which variables uniquely identify each row?

// Forward slashes are used as the path separator because Stata accepts them
// on Windows, macOS and Unix, whereas a backslash only works on Windows.
quietly use "$m/a_natchild", clear

// Compare two candidate keys: parent identifiers plus a_childno, and parent
// identifiers plus a_lchno.
duplicates report a_hidp a_pno a_childno

duplicates report a_hidp a_pno a_lchno

** A1. Each row is uniquely identified by a_hidp a_pno a_childno
|
|
|
|
|
|
** Q2. Does a_hidp a_pno refer to the parents?

// Forward slashes keep the paths portable across Windows, macOS and Unix.
quietly use "$m/a_natchild", clear

// Reduce to one row per (a_hidp, a_pno) so that the key is unique and the
// 1:1 merge below is valid.
bysort a_hidp a_pno: keep if _n==1

merge 1:1 a_hidp a_pno using "$m/a_indall", keepusing(a_dvage)

// There should not be any observations in a_natchild who are not in a_indall
assert _merge!=1

// Check age of those identified by a_hidp a_pno in a_natchild is 16+ years
keep if _merge==3

assert a_dvage>=16

** A2. Yes
|
|
|
|
|
|
** Q3. Does a_childno refer to the pno of children?

// Forward slashes keep the paths portable across Windows, macOS and Unix.
quietly use "$m/a_natchild", clear

// One row per (a_hidp, a_childno); then treat a_childno as if it were a PNO
// by renaming it to a_pno, so it can be used as the merge key with a_indall.
bysort a_hidp a_childno: keep if _n==1

drop a_pno

rename a_childno a_pno

merge 1:1 a_hidp a_pno using "$m/a_indall", keepusing(a_dvage)

// If a_childno were a PNO, there should not be any observations from
// a_natchild that are missing from a_indall. -capture noisily- reports a
// failed assertion without stopping the do-file.
capture noisily assert _merge!=1

// If a_childno were the PNO of a child, every matched person should be
// under 16 years old.
keep if _merge==3

capture noisily assert a_dvage<16

** A3. NO
|
|
|
|
|
|
|
|
|
|
** Checking that a_lchno is the PNO of the resident biological child as the variable description says and can be used to match NATCHILD with INDALL

** Restricted NATCHILD to only those who are resident children, a_lchno is valid PNO, that is, a_lchno>0 AND there is valid birth information

// Forward slashes keep the paths portable across Windows, macOS and Unix.
quietly use "$m/a_natchild", clear

// Attach the reporting parent's pidp and sex (a_hidp a_pno identify the
// parent, see Q2). INDALL-only rows are not needed, so they are discarded
// together with the merge indicator.
merge m:1 a_hidp a_pno using "$m/a_indall", keepusing(pidp a_sex)

drop if _merge==2

drop _merge

// Do fathers and mothers both report natchild info? Yes
// NOTE: -fre- is not a built-in Stata command; it has to be installed
// beforehand (for example with -ssc install fre-).
fre a_sex
|
|
|
|
** STEP1: only keep resident children

keep if a_lchlv==1

// The variables listed here should be missing for everyone, because no
// non-resident children remain in the data after the restriction above.
// They are summarised once as a check and then removed.
local nonresvars a_lchsx a_lchdoby a_lchal a_lchyd4

summarize `nonresvars'

drop `nonresvars'

// Do fathers and mothers both report natchild info about resident children? Yes
fre a_sex
|
|
|
|
** STEP2: drop if a_lchno<0 as in these cases a_lchno cannot be used to match with a_indall. N=76 cases

// Written as a -keep-: rows with negative a_lchno are removed, everything
// else (including system-missing values, which compare as larger than any
// number) stays, exactly as with -drop if a_lchno<0-.
keep if a_lchno>=0

fre a_sex
|
|
|
|
** STEP3: Drop observations where all birth variable values are missing

// validval flags rows where at least one birth variable holds a value that
// is greater than -8 and not system/extended missing.
local birthvars a_bwtxp a_bwtel a_bwtwk a_bwt a_bwtlb a_bwtoz a_bwtk a_bwtg5 a_brfed a_brfedend a_brfedend2

generate validval=0

foreach bv of local birthvars {
    replace validval=1 if `bv'<. & `bv'>-8
}

fre validval

// validval only takes the values 0 and 1, so keeping the 1s removes the 0s.
keep if validval==1

// Do fathers and mothers both report valid birth information about their resident biological children - NO, ONLY MOTHERS DO
fre a_sex

drop a_sex validval
|
|
|
|
|
|
** CHECK1: Did parents report about the same child (a_lchno) more than once? YES, N=136+6+2=144 cases

duplicates report a_hidp a_pno a_lchno

// duplicates = number of rows the parent supplied for this child
bysort a_hidp a_pno a_lchno: generate duplicates=_N

fre duplicates

** This is not a problem if the values of birth info variables are copies of each other. If that is the case then we can simply drop one of the observations.

// newid combines the parent's pidp with the child's a_lchno into one numeric
// id (stored as double so the digits are not rounded); dupl numbers the rows
// within newid. Declaring the pair with -tsset- makes the lag operator L.
// refer to the previous row for the same parent-child pair.
generate double newid=pidp*100+a_lchno

bysort newid: generate dupl=_n

sort newid dupl

tsset newid dupl

// prob = 1 when a birth variable differs from the previous row of the same
// pair and both values are non-missing and non-negative.
generate prob=0

foreach var in a_bwtxp a_bwtel a_bwtwk a_bwt a_bwtlb a_bwtoz a_bwtk a_bwtg5 a_brfed a_brfedend a_brfedend2 {
    replace prob=1 if `var'!=L.`var' & `var'<. & L.`var'<. & `var'>=0 & L.`var'>=0
}

// problems = number of conflicting rows within the parent-child pair.
// egen total() is the documented function; sum() is its older, undocumented
// name and returns the same result.
bysort a_hidp a_pno a_lchno: egen problems=total(prob)

tabulate duplicates problems
|
|
|
|
|
|
// EDIT1: parent-child pairs whose repeated rows carry identical birth
// information are reduced to a single row and written to dataok.dta.
// preserve/restore leaves the full data in memory for the next step.
preserve

// problems is a within-pair total and is never missing, so removing the
// non-zero rows leaves exactly the rows with problems==0.
drop if problems!=0

drop prob problems duplicates dupl newid

// Report duplicates before and after the reduction to one row per pair.
duplicates report a_hidp a_pno a_lchno

bysort a_hidp a_pno a_lchno: drop if _n>1

duplicates report a_hidp a_pno a_lchno

save dataok, replace

restore
|
|
|
|
// Parent-child pairs with several rows whose birth information is NOT
// identical: all of their rows are kept and written to dataprob.dta,
// flagged with usercheck=1.

// USER NEEDS TO CHECK AND DECIDE WHAT TO DO WITH THESE CASES

preserve

// problems is never missing, so this keeps exactly the rows with problems!=0.
drop if problems==0

duplicates report a_hidp a_pno a_lchno

drop duplicates problems prob dupl newid

order a_hidp a_pno pidp a_childno a_lchlv a_lchno

generate usercheck=1

save dataprob, replace

restore
|
|
|
|
|
|
** CHECK2: If we match information about the mother in DATAOK with the mother information in INDALL these should match

use dataok, clear

// In DATAOK a_pno and pidp belong to the reporting parent. Rename them with
// an x prefix so they do not clash with the INDALL variables, then use the
// child's PNO (a_lchno) as a_pno to look the child up in INDALL.
rename a_pno xa_mnpno

rename pidp xmpid

rename a_lchno a_pno

order a_hidp a_pno xa_mnpno xmpid a_childno

duplicates report a_hidp a_pno

// Forward slashes keep the path portable across Windows, macOS and Unix.
merge 1:1 a_hidp a_pno using "$m/a_indall"

drop if _merge==2

// There are two natural children in DATAOK who cannot be found in INDALL

// USER NEEDS TO CHECK AND DECIDE WHAT TO DO WITH THESE CASES

list pidp a_hidp a_pno xa_mnpno xmpid if _merge==1

// The mother IDs from the two sources should match. 3 MISMATCHES
// (x-prefixed variables come from NATCHILD, a_mnpno and mpid from INDALL)

count if xa_mnpno!=a_mnpno & _merge==3

count if xmpid!=mpid & _merge==3

list pidp a_hidp a_pno a_mnpno xa_mnpno mpid xmpid if ((xa_mnpno!=a_mnpno)|(xmpid!=mpid)) & _merge!=1
|
|
// For these cases, use relationship information in egoalt to decide.
// Nothing is changed here: the listings are for inspection only, and
// preserve/restore puts the CHECK2 data back in memory afterwards.

preserve

// Forward slashes keep the path portable across Windows, macOS and Unix.
use "$m/a_egoalt", clear

drop *stat

// NATCHILD says FEMALE with PIDP=884824167 reported birth info about this person.

// But INDALL does not say this person is the mother.

// This case has been checked and the conclusion by the DV team is that these two persons are siblings not parent-child. So, natchild information is wrong and this case should be dropped.

list if pidp==884824167, nolabel

list if apidp==884824167, nolabel

// INDALL says mother is 1157447055, NATCHILD says mother is 1157447051

// This case has been checked and the conclusion by the DV team is that the mother is PIDP=1157447055, PNO=3. So, natchild information is wrong and this case should be dropped.

list if pidp==1157447059, nolabel

list if apidp==1157447059, nolabel

// INDALL says mother is 1293863207, NATCHILD says mother is 1293863215

// This person has two natural mothers as shown in EGOALT, INDALL reports one of them as the natural mother and NATCHILD shows the other

// USER NEEDS TO CHECK AND DECIDE WHAT TO DO WITH THESE CASES

list if pidp==1293863219, nolabel

list if apidp==1293863219, nolabel

restore
|
|
|
|
|
|
// EDIT2: remove the two NATCHILD records that CHECK2 concluded are wrong and
// flag the remaining cases that need a user decision.

// Build a look-up of every person's pidp keyed by household and PNO, renamed
// so that it attaches to DATAOK through the child's PNO (a_lchno). This is
// needed because pidp in DATAOK is the REPORTING PARENT's pidp, not the
// child's.
use "$m/a_indall", clear

keep a_hidp a_pno pidp

rename a_pno a_lchno

rename pidp childpidp

tempfile childids

save `childids'

use dataok, clear

// Children that are not in INDALL simply get a missing childpidp.
merge m:1 a_hidp a_lchno using `childids', keep(master match) nogenerate

// Case 1: 884824167 is the woman who reported the birth information (found
// to be a sibling, not the mother), so the parent's pidp is the right test.
drop if pidp==884824167

// Case 2: in CHECK2 the mothers are 1157447051 (NATCHILD) and 1157447055
// (INDALL), and 1157447059 is the person whose relationships are listed,
// i.e. the child. Testing only the parent's pidp therefore matches nothing,
// so the child's pidp is tested as well. The original test is kept.
// NOTE(review): confirm against the data that exactly one row is removed.
drop if pidp==1157447059 | childpidp==1157447059

drop childpidp

// usercheck codes set in this step:
//   0 = no action needed
//   2 = two specific household/child records - presumably the two children
//       that CHECK2 could not find in INDALL (TODO confirm)
//   3 = household 1293863203, child PNO 4 - presumably the child with two
//       natural mothers from CHECK2 (TODO confirm)
// NOTE(review): the value labels attached to usercheck in FINAL DATAPREP
// must agree with these meanings.
generate usercheck=0

replace usercheck=2 if a_hidp==546694163 & a_lchno==10

replace usercheck=2 if a_hidp==1021175723 & a_lchno==4

replace usercheck=3 if a_hidp==1293863203 & a_lchno==4

fre usercheck

order a_hidp a_pno pidp a_childno a_lchlv a_lchno

save dataok, replace
|
|
|
|
|
|
** CHECK3: All matched cases should be <16 year old. Yes, they do.

// Combine one row per parent-child pair from DATAPROB with all of DATAOK.
use dataprob, clear

bysort a_hidp a_pno a_lchno: keep if _n==1

append using dataok

duplicates report a_hidp a_pno a_lchno

keep a_hidp a_pno a_lchno

// The parent's PNO is moved out of the way so that the child's PNO
// (a_lchno) can serve as a_pno, the key used to find the child in INDALL.
rename a_pno a_mnpno

rename a_lchno a_pno

// Forward slashes keep the path portable across Windows, macOS and Unix.
merge 1:1 a_hidp a_pno using "$m/a_indall", keepusing(a_dvage)

// Age distribution of the children that were found in INDALL
fre a_dvage if _merge==3
|
|
|
|
|
|
|
|
** FINAL DATAPREP

// Stack the clean records (DATAOK, usercheck 0/2/3) and the records with
// conflicting duplicate reports (DATAPROB, usercheck 1) into one file.
use dataok, clear

append using dataprob

// The labels follow the codes assigned in EDIT2: code 2 is set on two
// records (matching the two children that CHECK2 could not find in INDALL),
// and code 3 is set on household 1293863203, child PNO 4 (the child whose
// mother differs between NATCHILD and INDALL). The labels for 2 and 3 were
// previously the other way round.
label define usercheck 0 "ok" 1 "duplicates" 2 "Not in INDALL" 3 "check mother information"

label values usercheck usercheck

fre usercheck

save natchild_resident, replace
|
|
|
|
|
|
|
|
// Remove the two intermediate datasets written by this do-file, then stop.
foreach tmpdata in dataok dataprob {
    erase `tmpdata'.dta
}

exit
|