mckysse.github.io/_bibliography/papers.bib at master · Mckysse/mckysse.github.io · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
---
---

@inproceedings{chen-etal-2026-CoTHLV,
  author    = {Chen, Beiduo and
               Hu, Tiancheng and
               Zhang, Caiqi and
               Korhonen, Anna and
               Plank, Barbara},
  title     = {Decoupling the Effect of Chain-of-Thought Reasoning: A Human Label Variation Perspective},
  booktitle = {Findings of the Association for Computational Linguistics: ACL 2026},
  publisher = {Association for Computational Linguistics},
  address   = {San Diego, California},
  month     = jul,
  year      = {2026},
  doi       = {10.48550/ARXIV.2601.03154},
  url       = {https://doi.org/10.48550/arXiv.2601.03154},
  arxiv     = {2601.03154},
  code      = {https://github.com/mainlp/CoT-HLV},
  preview   = {ACL2026_CoTHLV_preview.png}
}

@inproceedings{hong-etal-2026-Taxonomy,
  author    = {Hong*, Pingjun and
               Chen*, Beiduo and
               Peng, Siyao and
               de Marneffe, Marie-Catherine and
               Roth, Benjamin and
               Plank, Barbara},
  title     = {Agree, Disagree, Explain: Decomposing Human Label Variation in {NLI} through the Lens of Explanations},
  booktitle = {Findings of the Association for Computational Linguistics: ACL 2026},
  publisher = {Association for Computational Linguistics},
  address   = {San Diego, California},
  month     = jul,
  year      = {2026},
  doi       = {10.48550/ARXIV.2510.16458},
  url       = {https://doi.org/10.48550/arXiv.2510.16458},
  arxiv     = {2510.16458},
  code      = {https://github.com/mainlp/LiTEx-NLI-extension},
  preview   = {ACL2026_Taxonomy_preview.png}
}

% EMNLP 2025 main-conference paper (anthology ID 2025.emnlp-main.1682).
% The "(Oral Presentation)" title prefix and the non-standard arxiv/code/poster/slides/preview
% fields are kept as-is; presumably they are consumed by the site template -- confirm before changing.
@inproceedings{chen-etal-2025-threading,
  author    = {Chen, Beiduo and
               Liu, Yang Janet and
               Korhonen, Anna and
               Plank, Barbara},
  title     = {(Oral Presentation) Threading the Needle: Reweaving Chain-of-Thought Reasoning to Explain Human Label Variation},
  editor    = {Christodoulopoulos, Christos and
               Chakraborty, Tanmoy and
               Rose, Carolyn and
               Peng, Violet},
  booktitle = {Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing},
  publisher = {Association for Computational Linguistics},
  address   = {Suzhou, China},
  month     = nov,
  year      = {2025},
  pages     = {33099--33123},
  isbn      = {979-8-89176-332-6},
  doi       = {10.18653/v1/2025.emnlp-main.1682},
  url       = {https://aclanthology.org/2025.emnlp-main.1682/},
  abstract  = {The recent rise of reasoning-tuned Large Language Models (LLMs){---}which generate chains of thought (CoTs) before giving the final answer{---}has attracted significant attention and offers new opportunities for gaining insights into human label variation, which refers to plausible differences in how multiple annotators label the same data instance. Prior work has shown that LLM-generated explanations can help align model predictions with human label distributions, but typically adopt a *reverse* paradigm: producing explanations based on given answers. In contrast, CoTs provide a *forward* reasoning path that may implicitly embed rationales for each answer option, before generating the answers. We thus propose a novel LLM-based pipeline enriched with linguistically-grounded discourse segmenters to extract supporting and opposing statements for each answer option from CoTs with improved accuracy. We also propose a rank-based HLV evaluation framework that prioritizes the ranking of answers over exact scores, which instead favor direct comparison of label distributions. Our method outperforms a direct generation method as well as baselines on three datasets, and shows better alignment of ranking methods with humans, highlighting the effectiveness of our approach.},
  arxiv     = {2505.23368},
  code      = {https://github.com/mainlp/CoT2EL},
  poster    = {EMNLP2025_CoT2EL_poster.pdf},
  slides    = {EMNLP2025_CoT2EL_slides.pdf},
  preview   = {EMNLP2025_CoT_preview.png}
}


@inproceedings{hong-etal-2025-litex,
  author    = {Hong*, Pingjun and
               Chen*, Beiduo and
               Peng, Siyao and
               de Marneffe, Marie-Catherine and
               Plank, Barbara},
  title     = {(S{A}{C} Highlights Award) {L}i{TE}x: {A} Linguistic Taxonomy of Explanations for Understanding Within-Label Variation in Natural Language Inference},
  editor    = {Christodoulopoulos, Christos and
               Chakraborty, Tanmoy and
               Rose, Carolyn and
               Peng, Violet},
  booktitle = {Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing},
  publisher = {Association for Computational Linguistics},
  address   = {Suzhou, China},
  month     = nov,
  year      = {2025},
  pages     = {34053--34073},
  isbn      = {979-8-89176-332-6},
  doi       = {10.18653/v1/2025.emnlp-main.1728},
  url       = {https://aclanthology.org/2025.emnlp-main.1728/},
  abstract  = {There is increasing evidence of Human Label Variation (HLV) in Natural Language Inference (NLI), where annotators assign different labels to the same premise-hypothesis pair. However, *within-label variation* {---} cases where annotators agree on the same label but provide divergent reasoning {---} poses an additional and mostly overlooked challenge. Several NLI datasets contain highlighted words in the NLI item as explanations, but the same spans on the NLI item can be highlighted for different reasons, as evidenced by free-text explanations, which offer a window into annotators' reasoning. To systematically understand this problem and gain insight into the rationales behind NLI labels, we introduce LiTEx, a linguistically-informed taxonomy for categorizing free-text explanations in English. Using this taxonomy, we annotate a subset of the e-SNLI dataset, validate the taxonomy{'}s reliability, and analyze how it aligns with NLI labels, highlights, and explanations. We further assess the taxonomy{'}s usefulness in explanation generation, demonstrating that conditioning generation on LiTEx yields explanations that are linguistically closer to human explanations than those generated using only labels or highlights. Our approach thus not only captures within-label variation but also shows how taxonomy-guided generation for reasoning can bridge the gap between human and model explanations more effectively than existing strategies.},
  arxiv     = {2505.22848},
  code      = {https://github.com/mainlp/LiTEx},
  poster    = {EMNLP2025_LiTEx_poster.pdf},
  slides    = {EMNLP2025_LiTEx_slides.pdf},
  preview   = {EMNLP2025_LiTEx_preview.png}
}


@inproceedings{zhao-etal-2025-makieval,
  author    = {Zhao, Raoyuan and
               Chen, Beiduo and
               Plank, Barbara and
               Hedderich, Michael A.},
  title     = {{MAKIE}val: {A} Multilingual Automatic {W}i{K}idata-based Framework for Cultural Awareness Evaluation for {LLM}s},
  editor    = {Christodoulopoulos, Christos and
               Chakraborty, Tanmoy and
               Rose, Carolyn and
               Peng, Violet},
  booktitle = {Findings of the Association for Computational Linguistics: EMNLP 2025},
  publisher = {Association for Computational Linguistics},
  address   = {Suzhou, China},
  month     = nov,
  year      = {2025},
  pages     = {23104--23136},
  isbn      = {979-8-89176-335-7},
  doi       = {10.18653/v1/2025.findings-emnlp.1256},
  url       = {https://aclanthology.org/2025.findings-emnlp.1256/},
  abstract  = {Large language models (LLMs) are used globally across many languages, but their English-centric pretraining raises concerns about cross-lingual disparities for cultural awareness, often resulting in biased outputs. However, comprehensive multilingual evaluation remains challenging due to limited benchmarks and questionable translation quality. To better assess these disparities, we introduce MAKIEval, an automatic multilingual framework for evaluating cultural awareness in LLMs across languages, regions, and topics. MAKIEval evaluates open-ended text generation, capturing how models express culturally grounded knowledge in natural language. Leveraging Wikidata{'}s multilingual structure as a cross-lingual anchor, it automatically identifies cultural entities in model outputs and links them to structured knowledge, enabling scalable, language-agnostic evaluation without manual annotation or translation. We then introduce four metrics that capture complementary dimensions of cultural awareness: granularity, diversity, cultural specificity, and consensus across languages. We assess 7 LLMs developed from different parts of the world, encompassing both open-source and proprietary systems, across 13 languages, 19 countries and regions, and 6 culturally salient topics (e.g., food, clothing). Notably, we find that models tend to exhibit stronger cultural awareness in English, suggesting that English prompts more effectively activate culturally grounded knowledge. We publicly release our code and data.},
  arxiv     = {2505.21693},
  code      = {https://github.com/mainlp/MAKIEval},
  preview   = {EMNLP2025_MAKI_preview.png}
}


@inproceedings{chen-etal-2025-rose,
  author    = {Chen, Beiduo and
               Peng, Siyao and
               Korhonen, Anna and
               Plank, Barbara},
  title     = {A Rose by Any Other Name: {LLM}-Generated Explanations Are Good Proxies for Human Explanations to Collect Label Distributions on {NLI}},
  editor    = {Che, Wanxiang and
               Nabende, Joyce and
               Shutova, Ekaterina and
               Pilehvar, Mohammad Taher},
  booktitle = {Findings of the Association for Computational Linguistics: ACL 2025},
  publisher = {Association for Computational Linguistics},
  address   = {Vienna, Austria},
  month     = jul,
  year      = {2025},
  pages     = {10777--10802},
  isbn      = {979-8-89176-256-5},
  doi       = {10.18653/v1/2025.findings-acl.562},
  url       = {https://aclanthology.org/2025.findings-acl.562/},
  abstract  = {Disagreement in human labeling is ubiquitous, and can be captured in human judgment distributions (HJDs). Recent research has shown that explanations provide valuable information for understanding human label variation (HLV) and large language models (LLMs) can approximate HJD from a few human-provided label-explanation pairs. However, collecting explanations for every label is still time-consuming. This paper examines whether LLMs can be used to replace humans in generating explanations for approximating HJD. Specifically, we use LLMs as annotators to generate model explanations for a few given human labels. We test ways to obtain and combine these label-explanations with the goal to approximate human judgment distributions. We further compare the resulting human with model-generated explanations, and test automatic and human explanation selection. Our experiments show that LLM explanations are promising for NLI: to estimate HJDs, generated explanations yield comparable results to human{'}s when provided with human labels. Importantly, our results generalize from datasets with human explanations to i) datasets where they are not available and ii) challenging out-of-distribution test sets.},
  arxiv     = {2412.13942},
  code      = {https://github.com/mainlp/MJD-Estimator},
  poster    = {ACL2025_ROSE_poster.pdf},
  slides    = {ACL2025_ROSE_slides.pdf},
  preview   = {ACL2025_ROSE_preview.png}
}


@inproceedings{chen-etal-2024-seeing,
  author    = {Chen, Beiduo and
               Wang, Xinpeng and
               Peng, Siyao and
               Litschko, Robert and
               Korhonen, Anna and
               Plank, Barbara},
  title     = {``Seeing the Big through the Small'': Can {LLM}s Approximate Human Judgment Distributions on {NLI} from a Few Explanations?},
  editor    = {Al-Onaizan, Yaser and
               Bansal, Mohit and
               Chen, Yun-Nung},
  booktitle = {Findings of the Association for Computational Linguistics: EMNLP 2024},
  publisher = {Association for Computational Linguistics},
  address   = {Miami, Florida, USA},
  month     = nov,
  year      = {2024},
  pages     = {14396--14419},
  doi       = {10.18653/v1/2024.findings-emnlp.842},
  url       = {https://aclanthology.org/2024.findings-emnlp.842/},
  abstract  = {Human label variation (HLV) is a valuable source of information that arises when multiple human annotators provide different labels for valid reasons. In Natural Language Inference (NLI) earlier approaches to capturing HLV involve either collecting annotations from many crowd workers to represent human judgment distribution (HJD) or use expert linguists to provide detailed explanations for their chosen labels. While the former method provides denser HJD information, obtaining it is resource-intensive. In contrast, the latter offers richer textual information but it is challenging to scale up to many human judges. Besides, large language models (LLMs) are increasingly used as evaluators ({``}LLM judges'') but with mixed results, and few works aim to study HJDs. This study proposes to exploit LLMs to approximate HJDs using a small number of expert labels and explanations. Our experiments show that a few explanations significantly improve LLMs' ability to approximate HJDs with and without explicit labels, thereby providing a solution to scale up annotations for HJD. However, fine-tuning smaller soft-label aware models with the LLM-generated model judgment distributions (MJDs) presents partially inconsistent results: while similar in distance, their resulting fine-tuned models and visualized distributions differ substantially. We show the importance of complementing instance-level distance measures with a global-level shape metric and visualization to more effectively evaluate MJDs against human judgment distributions.},
  arxiv     = {2406.17600},
  code      = {https://github.com/mainlp/MJD-Estimator},
  poster    = {EMNLP2024_MJDE_poster.pdf},
  slides    = {EMNLP2024_MJDE_slides.pdf},
  preview   = {EMNLP2024_MJDE_preview.png}
}


@inproceedings{chen-etal-2023-pre,
  author    = {Chen, Beiduo and
               Huang, Shaohan and
               Zhang, Zihan and
               Guo, Wu and
               Ling, Zhenhua and
               Huang, Haizhen and
               Wei, Furu and
               Deng, Weiwei and
               Zhang, Qi},
  title     = {Pre-training Language Model as a Multi-perspective Course Learner},
  editor    = {Rogers, Anna and
               Boyd-Graber, Jordan and
               Okazaki, Naoaki},
  booktitle = {Findings of the Association for Computational Linguistics: ACL 2023},
  publisher = {Association for Computational Linguistics},
  address   = {Toronto, Canada},
  month     = jul,
  year      = {2023},
  pages     = {114--128},
  doi       = {10.18653/v1/2023.findings-acl.9},
  url       = {https://aclanthology.org/2023.findings-acl.9/},
  abstract  = {ELECTRA, the generator-discriminator pre-training framework, has achieved impressive semantic construction capability among various downstream tasks. Despite the convincing performance, ELECTRA still faces the challenges of monotonous training and deficient interaction. Generator with only masked language modeling (MLM) leads to biased learning and label imbalance for discriminator, decreasing learning efficiency; no explicit feedback loop from discriminator to generator results in the chasm between these two components, underutilizing the course learning. In this study, a multi-perspective course learning (MCL) method is proposed to fetch a many degrees and visual angles for sample-efficient pre-training, and to fully leverage the relationship between generator and discriminator. Concretely, three self-supervision courses are designed to alleviate inherent flaws of MLM and balance the label in a multi-perspective way. Besides, two self-correction courses are proposed to bridge the chasm between the two encoders by creating a ``correction notebook'' for secondary-supervision. Moreover, a course soups trial is conducted to solve the ``tug-of-war'' dynamics problem of MCL, evolving a stronger pre-trained model. Experimental results show that our method significantly improves ELECTRA{'}s average performance by 2.8{\%} and 3.2{\%} absolute points respectively on GLUE and SQuAD 2.0 benchmarks, and overshadows recent advanced ELECTRA-style models under the same settings. The pre-trained MCL model is available at \url{https://huggingface.co/McmanusChen/MCL-base}.},
  arxiv     = {2305.03981},
  code      = {https://huggingface.co/McmanusChen/MCL-base},
  poster    = {ACL2023_MCL_poster.pdf},
  slides    = {ACL2023_MCL_slides.pdf},
  preview   = {ACL2023_MCL_preview.png}
}


@inproceedings{ma-etal-2022-wider,
  author    = {Ma*, Jun-Yu and
               Chen*, Beiduo and
               Gu, Jia-Chen and
               Ling, Zhenhua and
               Guo, Wu and
               Liu, Quan and
               Chen, Zhigang and
               Liu, Cong},
  title     = {Wider {\&} Closer: Mixture of Short-channel Distillers for Zero-shot Cross-lingual Named Entity Recognition},
  editor    = {Goldberg, Yoav and
               Kozareva, Zornitsa and
               Zhang, Yue},
  booktitle = {Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing},
  publisher = {Association for Computational Linguistics},
  address   = {Abu Dhabi, United Arab Emirates},
  month     = dec,
  year      = {2022},
  pages     = {5171--5183},
  doi       = {10.18653/v1/2022.emnlp-main.345},
  url       = {https://aclanthology.org/2022.emnlp-main.345/},
  abstract  = {Zero-shot cross-lingual named entity recognition (NER) aims at transferring knowledge from annotated and rich-resource data in source languages to unlabeled and lean-resource data in target languages. Existing mainstream methods based on the teacher-student distillation framework ignore the rich and complementary information lying in the intermediate layers of pre-trained language models, and domain-invariant information is easily lost during transfer. In this study, a mixture of short-channel distillers (MSD) method is proposed to fully interact the rich hierarchical information in the teacher model and to transfer knowledge to the student model sufficiently and efficiently. Concretely, a multi-channel distillation framework is designed for sufficient information transfer by aggregating multiple distillers as a mixture. Besides, an unsupervised method adopting parallel domain adaptation is proposed to shorten the channels between the teacher and student models to preserve domain-invariant features. Experiments on four datasets across nine languages demonstrate that the proposed method achieves new state-of-the-art performance on zero-shot cross-lingual NER and shows great generalization and compatibility across languages and fields.},
  arxiv     = {2212.03506},
  code      = {https://github.com/Mckysse/MSD},
  poster    = {EMNLP2022_MSD_poster.pdf},
  slides    = {EMNLP2022_MSD_slides.pdf},
  preview   = {EMNLP2022_MSD_preview.png}
}


% ICPR 2022 paper; the citation key matches the numeric suffix of the DOI.
% Normalised relative to the raw export: en-dash page range, lowercase month macro,
% empty volume/number fields dropped, acronym in the title brace-protected, and
% special characters in the abstract escaped the same way as the other entries in this file.
@inproceedings{9956721,
  author    = {Chen, Beiduo and Guo, Wu and Liu, Quan and Tao, Kun},
  title     = {Feature Aggregation in Zero-Shot Cross-Lingual Transfer Using Multilingual {BERT}},
  booktitle = {2022 26th International Conference on Pattern Recognition (ICPR)},
  publisher = {IEEE},
  month     = aug,
  year      = {2022},
  pages     = {1428--1435},
  issn      = {2831-7475},
  doi       = {10.1109/ICPR56361.2022.9956721},
  abstract  = {Multilingual BERT (mBERT), a language model pre-trained on large multilingual corpora, has impressive zero-shot cross-lingual transfer capabilities and performs surprisingly well on zero-shot POS tagging and Named Entity Recognition (NER), as well as on cross-lingual model transfer. At present, the mainstream methods to solve the cross-lingual downstream tasks are always using the last transformer layer{'}s output of mBERT as the representation of linguistic information. In this work, we explore the complementary property of lower layers to the last transformer layer of mBERT. A feature aggregation module based on an attention mechanism is proposed to fuse the information contained in different layers of mBERT. The experiments are conducted on four zero-shot cross-lingual transfer datasets, and the proposed method obtains performance improvements on key multilingual benchmark tasks XNLI (+1.5 {\%}), PAWS-X (+2.4 {\%}), NER (+1.2 F1), and POS (+1.5 F1). Through the analysis of the experimental results, we prove that the layers before the last layer of mBERT can provide extra useful information for cross-lingual downstream tasks and explore the interpretability of mBERT empirically.},
  keywords  = {Fuses;Bit error rate;Tagging;Linguistics;Benchmark testing;Transformers;Pattern recognition},
  arxiv     = {2205.08497},
  poster    = {ICPR2022_DLFA_poster.pdf},
  preview   = {ICPR2022_DLFA_preview.png}
}


@inproceedings{chen-etal-2022-ustc,
  author    = {Chen, Beiduo and
               Ma, Jun-Yu and
               Qi, Jiajun and
               Guo, Wu and
               Ling, Zhen-Hua and
               Liu, Quan},
  title     = {{USTC}-{NELSLIP} at {S}em{E}val-2022 Task 11: Gazetteer-Adapted Integration Network for Multilingual Complex Named Entity Recognition},
  editor    = {Emerson, Guy and
               Schluter, Natalie and
               Stanovsky, Gabriel and
               Kumar, Ritesh and
               Palmer, Alexis and
               Schneider, Nathan and
               Singh, Siddharth and
               Ratan, Shyam},
  booktitle = {Proceedings of the 16th International Workshop on Semantic Evaluation (SemEval-2022)},
  publisher = {Association for Computational Linguistics},
  address   = {Seattle, United States},
  month     = jul,
  year      = {2022},
  pages     = {1613--1622},
  doi       = {10.18653/v1/2022.semeval-1.223},
  url       = {https://aclanthology.org/2022.semeval-1.223/},
  abstract  = {This paper describes the system developed by the USTC-NELSLIP team for SemEval-2022 Task 11 Multilingual Complex Named Entity Recognition (MultiCoNER). We propose a gazetteer-adapted integration network (GAIN) to improve the performance of language models for recognizing complex named entities. The method first adapts the representations of gazetteer networks to those of language models by minimizing the KL divergence between them. After adaptation, these two networks are then integrated for backend supervised named entity recognition (NER) training. The proposed method is applied to several state-of-the-art Transformer-based NER models with a gazetteer built from Wikidata, and shows great generalization ability across them. The final predictions are derived from an ensemble of these trained models. Experimental results and detailed analysis verify the effectiveness of the proposed method. The official results show that our system ranked 1st on three tracks (Chinese, Code-mixed and Bangla) and 2nd on the other ten tracks in this task.},
  arxiv     = {2203.03216},
  code      = {https://github.com/Mckysse/GAIN},
  poster    = {SemEval2022_GAIN_poster.pdf},
  slides    = {SemEval2022_GAIN_slides.pdf},
  preview   = {SemEval2022_GAIN_preview.png}
}

% ICASSP 2022 paper; the citation key matches the numeric suffix of the DOI.
% Normalised relative to the raw export: en-dash page range, lowercase month macro,
% and the empty volume/number fields dropped (empty fields trigger BibTeX warnings).
@inproceedings{9747720,
  author    = {Chen, Beiduo and Guo, Wu and Gu, Bin and Liu, Quan and Wang, Yongchao},
  title     = {Multi-Level Contrastive Learning for Cross-Lingual Alignment},
  booktitle = {ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
  publisher = {IEEE},
  month     = may,
  year      = {2022},
  pages     = {7947--7951},
  issn      = {2379-190X},
  doi       = {10.1109/ICASSP43922.2022.9747720},
  abstract  = {Cross-language pre-trained models such as multilingual BERT (mBERT) have achieved significant performance in various cross-lingual downstream NLP tasks. This paper proposes a multi-level contrastive learning (ML-CTL) framework to further improve the cross-lingual ability of pre-trained models. The proposed method uses translated parallel data to encourage the model to generate similar semantic embeddings for different languages. However, unlike the sentence-level alignment used in most previous studies, in this paper, we explicitly integrate the word-level information of each pair of parallel sentences into contrastive learning. Moreover, cross-zero noise contrastive estimation (CZ-NCE) loss is proposed to alleviate the impact of the floating-point error in the training process with a small batch size. The proposed method significantly improves the cross-lingual transfer ability of our basic model (mBERT) and outperforms on multiple zero-shot cross-lingual downstream tasks compared to the same-size models in the Xtreme benchmark.},
  keywords  = {Training;Conferences;Semantics;Bit error rate;Estimation;Benchmark testing;Signal processing;Cross-language pre-trained model;contrastive learning;multi-level;cross-zero NCE;cross-lingual alignment},
  arxiv     = {2202.13083},
  code      = {https://github.com/Mckysse/ML-CTL},
  poster    = {ICASSP2022_ML-CTL_poster.pdf},
  slides    = {ICASSP2022_ML-CTL_slides.pdf},
  preview   = {ICASSP2022_ML-CTL_preview.png}
}