Skip to content

Commit 87a750e

Browse files
Merge pull request #2 from MasterAgentAI/claude/add-gender-nationality-uGJQp
Add gender and nationality columns to admission data pipeline
2 parents 9c534e3 + a813171 commit 87a750e

7 files changed

Lines changed: 314 additions & 61 deletions

File tree

cli/main.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -657,6 +657,17 @@ def cmd_stats(args: argparse.Namespace) -> None:
657657
table.add_row(prog_id, str(acc), str(rej), str(wl), str(total), rate)
658658

659659
console.print(table)
660+
661+
# Gender & nationality summary
662+
gender = summary.get("gender_dist", {})
663+
nat = summary.get("nationality_dist", {})
664+
if gender or nat:
665+
gender_str = f"M:{gender.get('M', 0)} F:{gender.get('F', 0)}"
666+
nat_parts = [f"{k}:{v}" for k, v in sorted(nat.items(), key=lambda x: -x[1])]
667+
console.print(
668+
f" [bold]Demographics:[/bold] Gender: {gender_str} | "
669+
f"Nationality: {', '.join(nat_parts)}"
670+
)
660671
console.print()
661672

662673
# GPA distribution

core/admission_data.py

Lines changed: 109 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,9 @@
66
77
CSV schema
88
----------
9-
id, bg_type, gpa, gpa_scale, gre, toefl, major, intern_desc,
10-
has_paper, has_research, courses_note, program, result, season, source
9+
id, gender, bg_type, nationality, gpa, gpa_scale, gre, toefl, major,
10+
intern_desc, has_paper, has_research, courses_note, program, result,
11+
season, source
1112
"""
1213

1314
from __future__ import annotations
@@ -165,6 +166,60 @@ def classify_background(bg_type: str) -> int:
165166
return 4 # default
166167

167168

169+
# ---------------------------------------------------------------------------
170+
# Nationality classification
171+
# ---------------------------------------------------------------------------
172+
173+
# Canonical nationality values
NATIONALITY_DOMESTIC = "domestic"      # US citizen / permanent resident
NATIONALITY_CHINA = "china"            # Chinese mainland
NATIONALITY_HK_TW = "hk_tw"            # Hong Kong, Macau, Taiwan
NATIONALITY_OTHER_INTL = "other_intl"  # Other international

# Alias -> canonical value. Keys are lowercase with ASCII spaces removed.
#
# Insertion order is the priority used by the fuzzy-matching pass in
# classify_nationality(): domestic aliases first, then HK/Macau/Taiwan, then
# mainland China. HK/TW must precede China so that compound values such as
# "中国香港" or "中国台湾" resolve to hk_tw instead of matching "中国" first.
_NATIONALITY_MAP: dict[str, str] = {
    # US citizen / permanent resident
    "美籍": NATIONALITY_DOMESTIC,
    "美国": NATIONALITY_DOMESTIC,
    "us": NATIONALITY_DOMESTIC,
    "usa": NATIONALITY_DOMESTIC,
    "domestic": NATIONALITY_DOMESTIC,
    "greencard": NATIONALITY_DOMESTIC,
    "绿卡": NATIONALITY_DOMESTIC,
    "pr": NATIONALITY_DOMESTIC,
    # Hong Kong, Macau, Taiwan
    "港澳台": NATIONALITY_HK_TW,
    "香港": NATIONALITY_HK_TW,
    "台湾": NATIONALITY_HK_TW,
    "澳门": NATIONALITY_HK_TW,
    "hk": NATIONALITY_HK_TW,
    "tw": NATIONALITY_HK_TW,
    "hongkong": NATIONALITY_HK_TW,
    "taiwan": NATIONALITY_HK_TW,
    "macau": NATIONALITY_HK_TW,
    "macao": NATIONALITY_HK_TW,
    # Chinese mainland
    "中国大陆": NATIONALITY_CHINA,
    "中国": NATIONALITY_CHINA,
    "大陆": NATIONALITY_CHINA,
    "china": NATIONALITY_CHINA,
    "mainland": NATIONALITY_CHINA,
    "prc": NATIONALITY_CHINA,
}

# Placeholder values that mean "nationality not reported".
_UNKNOWN_NATIONALITY_VALUES = frozenset({"不明", "n/a", "unknown"})

# ASCII aliases shorter than this are only matched as whole words. As raw
# substrings, "us" / "pr" / "hk" hit unrelated values ("Russia", "Cyprus",
# "PRC"), which silently inflated the 'domestic' bucket.
_MIN_SUBSTRING_ALIAS_LEN = 4


def classify_nationality(nationality: str) -> str:
    """Map a nationality string to a canonical value.

    Matching is case-insensitive and ignores ASCII spaces. It proceeds in
    three steps:

    1. Empty or placeholder values ('不明', 'n/a', 'unknown') return 'china',
       the most common value in the MFE applicant pool.
    2. An exact alias match against ``_NATIONALITY_MAP``.
    3. A fuzzy pass over the aliases in map (priority) order. Non-ASCII
       aliases and ASCII aliases of at least ``_MIN_SUBSTRING_ALIAS_LEN``
       characters match as substrings of the input; shorter ASCII aliases
       must appear as a standalone word (e.g. "US citizen" matches "us",
       "Russia" does not).

    The input is never matched as a substring *of* an alias, so short
    fragments such as "ca" or "in" no longer hit "greencard" or "china".

    Args:
        nationality: Raw nationality text, e.g. a CSV cell. ``None`` is
            treated like an empty string.

    Returns:
        One of 'domestic', 'china', 'hk_tw', 'other_intl'. Anything that
        matches no alias is 'other_intl'.
    """
    lowered = (nationality or "").strip().lower()
    val = lowered.replace(" ", "")
    if not val or val in _UNKNOWN_NATIONALITY_VALUES:
        return NATIONALITY_CHINA  # default for MFE applicant pool

    # Exact match
    if val in _NATIONALITY_MAP:
        return _NATIONALITY_MAP[val]

    # ASCII words of the input, split on anything that is not an ASCII
    # letter (spaces, punctuation, CJK characters).
    words = set(
        "".join(ch if ch.isascii() and ch.isalpha() else " " for ch in lowered).split()
    )

    # Fuzzy match, in map priority order
    for key, canonical in _NATIONALITY_MAP.items():
        if key.isascii() and len(key) < _MIN_SUBSTRING_ALIAS_LEN:
            matched = key in words
        else:
            matched = key in val
        if matched:
            return canonical

    return NATIONALITY_OTHER_INTL
221+
222+
168223
# ---------------------------------------------------------------------------
169224
# Intern strength scoring
170225
# ---------------------------------------------------------------------------
@@ -230,8 +285,11 @@ class AdmissionRecord:
230285
"""A single real applicant data point with normalized fields."""
231286

232287
id: str = ""
288+
gender: str = "" # M / F / empty
233289
bg_type: str = ""
234290
bg_tier: int = 4 # 1-5, computed from bg_type
291+
nationality: str = "" # raw value
292+
nationality_canonical: str = "" # domestic / china / hk_tw / other_intl
235293
gpa_raw: float = 0.0
236294
gpa_scale: float = 4.0
237295
gpa_normalized: float = 0.0 # on 4.0 scale
@@ -266,6 +324,8 @@ class ProgramStats:
266324
avg_intern_score_accepted: float = 0.0
267325
paper_rate_accepted: float = 0.0
268326
research_rate_accepted: float = 0.0
327+
female_rate_accepted: float = 0.0 # fraction of female among accepted
328+
nationality_dist_accepted: dict[str, int] = field(default_factory=dict)
269329

270330
# Rejected applicant stats
271331
avg_gpa_rejected: float = 0.0
@@ -348,11 +408,15 @@ def load_admission_csv(path: str | Path) -> list[AdmissionRecord]:
348408
gpa_scale = 4.0
349409

350410
bg_type = row.get("bg_type", "").strip()
411+
nationality_raw = row.get("nationality", "").strip()
351412

352413
rec = AdmissionRecord(
353414
id=row.get("id", "").strip(),
415+
gender=row.get("gender", "").strip().upper(),
354416
bg_type=bg_type,
355417
bg_tier=classify_background(bg_type),
418+
nationality=nationality_raw,
419+
nationality_canonical=classify_nationality(nationality_raw),
356420
gpa_raw=gpa_raw,
357421
gpa_scale=gpa_scale,
358422
gpa_normalized=normalize_gpa(gpa_raw, gpa_scale),
@@ -454,6 +518,19 @@ def compute_program_stats(
454518
if research_known
455519
else 0.0
456520
)
521+
# Gender stats
522+
gendered = [r for r in accepted if r.gender in ("M", "F")]
523+
stats.female_rate_accepted = (
524+
sum(1 for r in gendered if r.gender == "F") / len(gendered)
525+
if gendered
526+
else 0.0
527+
)
528+
# Nationality distribution
529+
nat_dist: dict[str, int] = {}
530+
for r in accepted:
531+
nat = r.nationality_canonical or "unknown"
532+
nat_dist[nat] = nat_dist.get(nat, 0) + 1
533+
stats.nationality_dist_accepted = nat_dist
457534

458535
# Rejected stats
459536
if rejected:
@@ -530,6 +607,22 @@ def _effect_size(acc_vals: list[float], rej_vals: list[float]) -> float:
530607
[1.0 if r.has_research else 0.0 for r in rejected if r.has_research is not None],
531608
)
532609

610+
# Gender (female = 1, male = 0)
611+
acc_gender = [1.0 if r.gender == "F" else 0.0 for r in accepted if r.gender in ("M", "F")]
612+
rej_gender = [1.0 if r.gender == "F" else 0.0 for r in rejected if r.gender in ("M", "F")]
613+
features["gender_f"] = _effect_size(acc_gender, rej_gender)
614+
615+
# Nationality (domestic = 1, international = 0)
616+
acc_nat = [
617+
1.0 if r.nationality_canonical == "domestic" else 0.0
618+
for r in accepted if r.nationality_canonical
619+
]
620+
rej_nat = [
621+
1.0 if r.nationality_canonical == "domestic" else 0.0
622+
for r in rejected if r.nationality_canonical
623+
]
624+
features["domestic"] = _effect_size(acc_nat, rej_nat)
625+
533626
return features
534627

535628

@@ -564,6 +657,18 @@ def summarize_records(records: list[AdmissionRecord]) -> dict[str, Any]:
564657
seasons = sorted({r.season for r in records if r.season})
565658
sources = sorted({r.source for r in records if r.source})
566659

660+
# Gender breakdown
661+
gendered = [r for r in records if r.gender in ("M", "F")]
662+
gender_dist = {"M": 0, "F": 0}
663+
for r in gendered:
664+
gender_dist[r.gender] += 1
665+
666+
# Nationality breakdown
667+
nat_dist: dict[str, int] = {}
668+
for r in records:
669+
nat = r.nationality_canonical or "unknown"
670+
nat_dist[nat] = nat_dist.get(nat, 0) + 1
671+
567672
return {
568673
"total_records": len(records),
569674
"unique_applicants": len({r.id for r in records}),
@@ -572,4 +677,6 @@ def summarize_records(records: list[AdmissionRecord]) -> dict[str, Any]:
572677
"sources": sources,
573678
"avg_gpa_normalized": _safe_avg([r.gpa_normalized for r in records]),
574679
"gre_available": sum(1 for r in records if r.gre is not None),
680+
"gender_dist": gender_dist,
681+
"nationality_dist": nat_dist,
575682
}

core/calibrator.py

Lines changed: 31 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -209,28 +209,28 @@ def predict_outcome(
209209
score = 0.0
210210
max_score = 0.0
211211

212-
# GPA component (40%)
213-
weight_gpa = 0.4
212+
# GPA component (35%)
213+
weight_gpa = 0.35
214214
max_score += weight_gpa
215215
if threshold.gpa_target > 0:
216216
gpa_ratio = record.gpa_normalized / threshold.gpa_target
217217
score += weight_gpa * min(1.0, gpa_ratio)
218218

219-
# Background tier (25%)
220-
weight_bg = 0.25
219+
# Background tier (20%)
220+
weight_bg = 0.20
221221
max_score += weight_bg
222222
if threshold.max_bg_tier_accepted > 0:
223223
bg_ratio = 1.0 - (record.bg_tier - 1) / 4.0 # tier 1=1.0, tier 5=0.0
224224
score += weight_bg * max(0.0, bg_ratio)
225225

226-
# Intern score (20%)
227-
weight_intern = 0.2
226+
# Intern score (18%)
227+
weight_intern = 0.18
228228
max_score += weight_intern
229229
if record.intern_score > 0:
230230
score += weight_intern * min(1.0, record.intern_score / 8.0)
231231

232-
# Research/paper bonus (15%)
233-
weight_research = 0.15
232+
# Research/paper bonus (12%)
233+
weight_research = 0.12
234234
max_score += weight_research
235235
bonus = 0.0
236236
if record.has_paper:
@@ -239,6 +239,29 @@ def predict_outcome(
239239
bonus += 0.5
240240
score += weight_research * bonus
241241

242+
# Gender diversity bonus (7%)
243+
# MFE programs skew heavily male; female applicants may benefit
244+
weight_gender = 0.07
245+
max_score += weight_gender
246+
if record.gender == "F":
247+
score += weight_gender * 1.0
248+
elif record.gender == "M":
249+
score += weight_gender * 0.4 # baseline, no penalty
250+
251+
# Nationality / domestic advantage (8%)
252+
# Domestic applicants (US citizens/PR) have slight advantage
253+
weight_nat = 0.08
254+
max_score += weight_nat
255+
nat = record.nationality_canonical
256+
if nat == "domestic":
257+
score += weight_nat * 1.0
258+
elif nat == "hk_tw":
259+
score += weight_nat * 0.6
260+
elif nat == "china":
261+
score += weight_nat * 0.4 # largest applicant pool, most competitive
262+
else:
263+
score += weight_nat * 0.5
264+
242265
# Classify based on score ratio
243266
ratio = score / max_score if max_score > 0 else 0.0
244267

data/admissions/sample.csv

Lines changed: 31 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,31 +1,31 @@
1-
id,bg_type,gpa,gpa_scale,gre,toefl,major,intern_desc,has_paper,has_research,courses_note,program,result,season,source
2-
1,两财一贸(211),91.8,100,331,110+,金工,3段量化私募QR(含top百亿)+三中一华金工组,不明,不明,,baruch-mfe,accepted,2025Fall,quantnet
3-
2,两财一贸(211),91.8,100,331,110+,金工,3段量化私募QR(含top百亿)+三中一华金工组,不明,不明,,princeton-mfin,rejected,2025Fall,quantnet
4-
3,两财一贸(211),91.8,100,331,110+,金工,3段量化私募QR(含top百亿)+三中一华金工组,不明,不明,,cmu-mscf,accepted,2025Fall,quantnet
5-
4,两财一贸(211),91.8,100,331,110+,金工,3段量化私募QR(含top百亿)+三中一华金工组,不明,不明,,mit-mfin,rejected,2025Fall,quantnet
6-
5,两财一贸(211),91.8,100,331,110+,金工,3段量化私募QR(含top百亿)+三中一华金工组,不明,不明,,uchicago-msfm,accepted,2025Fall,quantnet
7-
6,两财一贸(211),91.8,100,331,110+,金工,3段量化私募QR(含top百亿)+三中一华金工组,不明,不明,,gatech-qcf,accepted,2025Fall,quantnet
8-
7,两财一贸(211),91.8,100,331,110+,金工,3段量化私募QR(含top百亿)+三中一华金工组,不明,不明,,nus-qf,accepted,2025Fall,quantnet
9-
8,985,3.8,4,332,112,数学,2段量化实习+1段券商研究,,,实分析+随机过程+C++,baruch-mfe,accepted,2025Fall,chasedream
10-
9,985,3.8,4,332,112,数学,2段量化实习+1段券商研究,,,实分析+随机过程+C++,princeton-mfin,rejected,2025Fall,chasedream
11-
10,985,3.8,4,332,112,数学,2段量化实习+1段券商研究,,,实分析+随机过程+C++,cmu-mscf,accepted,2025Fall,chasedream
12-
11,海本(Top30),3.92,4,335,,,2段顶级量化+1段投行,,,数学+CS双专业,baruch-mfe,accepted,2025Fall,linkedin
13-
12,海本(Top30),3.92,4,335,,,2段顶级量化+1段投行,,,数学+CS双专业,princeton-mfin,accepted,2025Fall,linkedin
14-
13,海本(Top30),3.92,4,335,,,2段顶级量化+1段投行,,,数学+CS双专业,cmu-mscf,accepted,2025Fall,linkedin
15-
14,985,3.5,4,325,105,金融,1段银行实习,,,,columbia-mafn,rejected,2025Fall,chasedream
16-
15,985,3.5,4,325,105,金融,1段银行实习,,,,gatech-qcf,accepted,2025Fall,chasedream
17-
16,211,3.6,4,328,108,统计,2段数据分析实习,,,时间序列+回归分析,nyu-mfe,rejected,2025Fall,quantnet
18-
17,211,3.6,4,328,108,统计,2段数据分析实习,,,时间序列+回归分析,rutgers-msmf,accepted,2025Fall,quantnet
19-
18,211,3.6,4,328,108,统计,2段数据分析实习,,,时间序列+回归分析,fordham-msqf,accepted,2025Fall,quantnet
20-
19,985,87,100,330,115,金工,3段量化+1段投行,,,随机微积分+实分析+ML,baruch-mfe,accepted,2025Fall,offershow
21-
20,985,87,100,330,115,金工,3段量化+1段投行,,,随机微积分+实分析+ML,nyu-mfe,accepted,2025Fall,offershow
22-
21,海本(Top50),3.7,4,329,,,1段量化实习+1段fintech,,,CS+数学辅修,columbia-mafn,accepted,2025Fall,offershow
23-
22,海本(Top50),3.7,4,329,,,1段量化实习+1段fintech,,,CS+数学辅修,mit-mfin,rejected,2025Fall,offershow
24-
23,双非一本,3.8,4,326,102,应用数学,1段量化实习,,,概率论+线代+微积分,rutgers-msmf,accepted,2025Fall,chasedream
25-
24,双非一本,3.8,4,326,102,应用数学,1段量化实习,,,概率论+线代+微积分,baruch-mfe,rejected,2025Fall,chasedream
26-
25,985,3.9,4,333,118,计算机,3段量化实习+kaggle金牌,,,ML+深度学习+C++,baruch-mfe,accepted,2025Fall,quantnet
27-
26,985,3.9,4,333,118,计算机,3段量化实习+kaggle金牌,,,ML+深度学习+C++,cmu-mscf,accepted,2025Fall,quantnet
28-
27,985,3.9,4,333,118,计算机,3段量化实习+kaggle金牌,,,ML+深度学习+C++,princeton-mfin,waitlisted,2025Fall,quantnet
29-
28,海本(Top10),3.95,4.3,337,,,数学+金融双专业,2段顶级投行+1段对冲基金,,,实分析+泛函+随机微积分,princeton-mfin,accepted,2025Fall,linkedin
30-
29,海本(Top10),3.95,4.3,337,,,数学+金融双专业,2段顶级投行+1段对冲基金,,,实分析+泛函+随机微积分,baruch-mfe,accepted,2025Fall,linkedin
31-
30,海本(Top10),3.95,4.3,337,,,数学+金融双专业,2段顶级投行+1段对冲基金,,,实分析+泛函+随机微积分,mit-mfin,accepted,2025Fall,linkedin
1+
id,gender,bg_type,nationality,gpa,gpa_scale,gre,toefl,major,intern_desc,has_paper,has_research,courses_note,program,result,season,source
2+
1,M,两财一贸(211),中国大陆,91.8,100,331,110+,金工,3段量化私募QR(含top百亿)+三中一华金工组,不明,不明,,baruch-mfe,accepted,2025Fall,quantnet
3+
2,M,两财一贸(211),中国大陆,91.8,100,331,110+,金工,3段量化私募QR(含top百亿)+三中一华金工组,不明,不明,,princeton-mfin,rejected,2025Fall,quantnet
4+
3,M,两财一贸(211),中国大陆,91.8,100,331,110+,金工,3段量化私募QR(含top百亿)+三中一华金工组,不明,不明,,cmu-mscf,accepted,2025Fall,quantnet
5+
4,M,两财一贸(211),中国大陆,91.8,100,331,110+,金工,3段量化私募QR(含top百亿)+三中一华金工组,不明,不明,,mit-mfin,rejected,2025Fall,quantnet
6+
5,M,两财一贸(211),中国大陆,91.8,100,331,110+,金工,3段量化私募QR(含top百亿)+三中一华金工组,不明,不明,,uchicago-msfm,accepted,2025Fall,quantnet
7+
6,M,两财一贸(211),中国大陆,91.8,100,331,110+,金工,3段量化私募QR(含top百亿)+三中一华金工组,不明,不明,,gatech-qcf,accepted,2025Fall,quantnet
8+
7,M,两财一贸(211),中国大陆,91.8,100,331,110+,金工,3段量化私募QR(含top百亿)+三中一华金工组,不明,不明,,nus-qf,accepted,2025Fall,quantnet
9+
8,M,985,中国大陆,3.8,4,332,112,数学,2段量化实习+1段券商研究,,,实分析+随机过程+C++,baruch-mfe,accepted,2025Fall,chasedream
10+
9,M,985,中国大陆,3.8,4,332,112,数学,2段量化实习+1段券商研究,,,实分析+随机过程+C++,princeton-mfin,rejected,2025Fall,chasedream
11+
10,M,985,中国大陆,3.8,4,332,112,数学,2段量化实习+1段券商研究,,,实分析+随机过程+C++,cmu-mscf,accepted,2025Fall,chasedream
12+
11,F,海本(Top30),美籍,3.92,4,335,,,2段顶级量化+1段投行,,,数学+CS双专业,baruch-mfe,accepted,2025Fall,linkedin
13+
12,F,海本(Top30),美籍,3.92,4,335,,,2段顶级量化+1段投行,,,数学+CS双专业,princeton-mfin,accepted,2025Fall,linkedin
14+
13,F,海本(Top30),美籍,3.92,4,335,,,2段顶级量化+1段投行,,,数学+CS双专业,cmu-mscf,accepted,2025Fall,linkedin
15+
14,M,985,中国大陆,3.5,4,325,105,金融,1段银行实习,,,,columbia-mafn,rejected,2025Fall,chasedream
16+
15,M,985,中国大陆,3.5,4,325,105,金融,1段银行实习,,,,gatech-qcf,accepted,2025Fall,chasedream
17+
16,F,211,中国大陆,3.6,4,328,108,统计,2段数据分析实习,,,时间序列+回归分析,nyu-mfe,rejected,2025Fall,quantnet
18+
17,F,211,中国大陆,3.6,4,328,108,统计,2段数据分析实习,,,时间序列+回归分析,rutgers-msmf,accepted,2025Fall,quantnet
19+
18,F,211,中国大陆,3.6,4,328,108,统计,2段数据分析实习,,,时间序列+回归分析,fordham-msqf,accepted,2025Fall,quantnet
20+
19,M,985,中国大陆,87,100,330,115,金工,3段量化+1段投行,,,随机微积分+实分析+ML,baruch-mfe,accepted,2025Fall,offershow
21+
20,M,985,中国大陆,87,100,330,115,金工,3段量化+1段投行,,,随机微积分+实分析+ML,nyu-mfe,accepted,2025Fall,offershow
22+
21,M,海本(Top50),港澳台,3.7,4,329,,,1段量化实习+1段fintech,,,CS+数学辅修,columbia-mafn,accepted,2025Fall,offershow
23+
22,M,海本(Top50),港澳台,3.7,4,329,,,1段量化实习+1段fintech,,,CS+数学辅修,mit-mfin,rejected,2025Fall,offershow
24+
23,M,双非一本,中国大陆,3.8,4,326,102,应用数学,1段量化实习,,,概率论+线代+微积分,rutgers-msmf,accepted,2025Fall,chasedream
25+
24,M,双非一本,中国大陆,3.8,4,326,102,应用数学,1段量化实习,,,概率论+线代+微积分,baruch-mfe,rejected,2025Fall,chasedream
26+
25,M,985,中国大陆,3.9,4,333,118,计算机,3段量化实习+kaggle金牌,,,ML+深度学习+C++,baruch-mfe,accepted,2025Fall,quantnet
27+
26,M,985,中国大陆,3.9,4,333,118,计算机,3段量化实习+kaggle金牌,,,ML+深度学习+C++,cmu-mscf,accepted,2025Fall,quantnet
28+
27,M,985,中国大陆,3.9,4,333,118,计算机,3段量化实习+kaggle金牌,,,ML+深度学习+C++,princeton-mfin,waitlisted,2025Fall,quantnet
29+
28,F,海本(Top10),美籍,3.95,4.3,337,,数学+金融双专业,2段顶级投行+1段对冲基金,,,实分析+泛函+随机微积分,princeton-mfin,accepted,2025Fall,linkedin
30+
29,F,海本(Top10),美籍,3.95,4.3,337,,数学+金融双专业,2段顶级投行+1段对冲基金,,,实分析+泛函+随机微积分,baruch-mfe,accepted,2025Fall,linkedin
31+
30,F,海本(Top10),美籍,3.95,4.3,337,,数学+金融双专业,2段顶级投行+1段对冲基金,,,实分析+泛函+随机微积分,mit-mfin,accepted,2025Fall,linkedin

data/admissions/template.csv

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
id,bg_type,gpa,gpa_scale,gre,toefl,major,intern_desc,has_paper,has_research,courses_note,program,result,season,source
1+
id,gender,bg_type,nationality,gpa,gpa_scale,gre,toefl,major,intern_desc,has_paper,has_research,courses_note,program,result,season,source

0 commit comments

Comments
 (0)