Publications

My research focuses on reinforcement learning for robotics and the capabilities and security of Large Language Models (LLMs). My current research directions include reinforcement learning algorithms for robotic manipulation, responsible AI, LLM post-training, development of reliable LLM-powered applications, and AI-enhanced educational technology.

2025
Leveraging Generative AI in Designing and Delivering Individualized Responsive Feedback for Pre-service Teachers in Higher Education [Springer]
Jingwen He, Tingting Li, Zexin Xu, Kui Xie
@Artificial Intelligence and Human Agency in Education: Volume Two: AI for Equity, Well-Being, and Innovation in Teaching and Learning
PDF
@incollection{He2025,
  author    = {Jingwen He and Tingting Li and Zexin Xu and Kui Xie},
  title     = {Leveraging Generative AI in Designing and Delivering Individualized Responsive Feedback for Pre-service Teachers in Higher Education},
  editor    = {Adarkwah, Michael Agyemang and Amponsah, Samuel and Huang, Ronghuai and Thomas, Michael},
  booktitle = {Artificial Intelligence and Human Agency in Education: Volume Two: AI for Equity, Well-Being, and Innovation in Teaching and Learning},
  publisher = {Springer Nature Singapore},
  address   = {Singapore},
  year      = {2025},
  pages     = {267--293},
  isbn      = {978-981-96-9251-4},
  doi       = {10.1007/978-981-96-9251-4_11},
  url       = {https://doi.org/10.1007/978-981-96-9251-4_11},
  abstract  = {This book chapter investigates the integration of generative AI, specifically ChatGPT, in delivering individualized responsive feedback for pre-service teachers. Through a comprehensive literature review and the development of an initial conceptual framework, we explored the practical application of prompt engineering in two detailed case studies. This process led to the formulation of an improved theoretical framework that incorporates student input, the creation of structured prompts, iterative self-reflection processes, and the generation of individualized responsive feedback. The study's feedback was task-specific, personalized to individual assignments where students reflected on how their identities, worldviews, and cognitive biases shaped their learning and future teaching practices. Although the exact prompts may vary with different educational tasks, the underlying structure and prompt tuning procedures are versatile and transferable across various contexts and subject areas. This flexibility underscores the framework's applicability to diverse educational settings, ensuring relevance, and adaptability. The findings emphasize the essential roles of students, researchers, and particularly instructors in guiding ChatGPT to produce feedback that is not only informative and relevant but also empathetic and personalized. This study contributes significantly to the field of educational technology by presenting a robust framework that combines advanced AI capabilities with critical human oversight, facilitating the delivery of high-quality, individualized feedback in higher education.}
}
Efficiency Robustness of Dynamic Deep Learning Systems [USENIX Security]
Ravishka Rathnasuriya, Zexin Xu, Tingxi Li, Zihe Song, Mirazul Haque, Simin Chen, Wei Yang
@USENIX Security '25
PDF | CODE
@inproceedings{rathnasuriya2025efficiencyrobustnessdynamicdeep,
  author        = {Ravishka Rathnasuriya and Zexin Xu and Tingxi Li and Zihe Song and Mirazul Haque and Simin Chen and Wei Yang},
  title         = {Efficiency Robustness of Dynamic Deep Learning Systems},
  booktitle     = {USENIX Security '25},
  year          = {2025},
  eprint        = {2506.10831},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
  url           = {https://arxiv.org/abs/2506.10831}
}
COMET: Closed-loop orchestration for malicious elicitation techniques in code models [Amazon]
Zexin Xu, Tingxi Li, Ravishka Shemal Rathnasuriya, Zihe Song, Jun Ren, Bhavesh Mandalapu, Soroush Setayeshpour, Xinya Du, Wei Yang
@Amazon Nova AI Challenge Proceedings
PDF
@inproceedings{Dallas2025,
  author    = {Zexin Xu and Tingxi Li and Ravishka Shemal Rathnasuriya and Zihe Song and Jun Ren and Bhavesh Mandalapu and Soroush Setayeshpour and Xinya Du and Wei Yang},
  title     = {{COMET}: Closed-loop orchestration for malicious elicitation techniques in code models},
  booktitle = {Amazon Nova AI Challenge Proceedings},
  year      = {2025},
  url       = {https://www.amazon.science/nova-ai-challenge/proceedings/comet-closed-loop-orchestration-for-malicious-elicitation-techniques-in-code-models}
}
LLM4SR: A Survey on Large Language Models for Scientific Research [arXiv]
Ziming Luo, Zonglin Yang, Zexin Xu, Wei Yang, Xinya Du
@arXiv preprint
PDF
@misc{luo2025llm4srsurveylargelanguage,
  author        = {Ziming Luo and Zonglin Yang and Zexin Xu and Wei Yang and Xinya Du},
  title         = {{LLM4SR}: A Survey on Large Language Models for Scientific Research},
  year          = {2025},
  eprint        = {2501.04306},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2501.04306},
  booktitle     = {arXiv preprint}
}
Beyond Pass or Fail: Multi-Dimensional Benchmarking of Foundation Models for Goal-based Mobile UI Navigation [arXiv]
Dezhi Ran, Mengzhou Wu, Hao Yu, Yuetong Li, Jun Ren, Yuan Cao, Xia Zeng, Haochuan Lu, Zexin Xu, Mengqian Xu, Ting Su, Liangchao Yao, Ting Xiong, Wei Yang, Yuetang Deng, Assaf Marron, David Harel, Tao Xie
@arXiv preprint
PDF
@misc{ran2025passfailmultidimensionalbenchmarking,
  author        = {Dezhi Ran and Mengzhou Wu and Hao Yu and Yuetong Li and Jun Ren and Yuan Cao and Xia Zeng and Haochuan Lu and Zexin Xu and Mengqian Xu and Ting Su and Liangchao Yao and Ting Xiong and Wei Yang and Yuetang Deng and Assaf Marron and David Harel and Tao Xie},
  title         = {Beyond Pass or Fail: Multi-Dimensional Benchmarking of Foundation Models for Goal-based Mobile {UI} Navigation},
  year          = {2025},
  eprint        = {2501.02863},
  archivePrefix = {arXiv},
  primaryClass  = {cs.SE},
  url           = {https://arxiv.org/abs/2501.02863},
  booktitle     = {arXiv preprint}
}
2023
Towards a robust and generalizable embodied agent [Amazon]
Chan Hee Song, Jiaman Wu, Ju-Seung Byun, Zexin Xu, Vardaan Pahuja, Goonmeet Bajaj, Samuel Stevens, Ziru Chen, Yu Su
@Alexa Prize SimBot Challenge Proceedings
PDF
@inproceedings{SalsaBotAmazon,
  author    = {Song, Chan Hee and Wu, Jiaman and Byun, Ju-Seung and Xu, Zexin and Pahuja, Vardaan and Bajaj, Goonmeet and Stevens, Samuel and Chen, Ziru and Su, Yu},
  title     = {Towards a robust and generalizable embodied agent},
  booktitle = {Alexa Prize SimBot Challenge Proceedings},
  year      = {2023},
  url       = {https://www.amazon.science/alexa-prize/proceedings/towards-a-robust-and-generalizable-embodied-agent}
}
Towards a robust and generalizable embodied agent [CVPR]
Chan Hee Song, Jiaman Wu, Ju-Seung Byun, Zexin Xu, Vardaan Pahuja, Goonmeet Bajaj, Samuel Stevens, Ziru Chen, Yu Su
@Embodied AI Workshop at CVPR
PDF
@inproceedings{SalsaBotCVPR,
  author    = {Song, Chan Hee and Wu, Jiaman and Byun, Ju-Seung and Xu, Zexin and Pahuja, Vardaan and Bajaj, Goonmeet and Stevens, Samuel and Chen, Ziru and Su, Yu},
  title     = {Towards a robust and generalizable embodied agent},
  booktitle = {Embodied AI Workshop at CVPR},
  year      = {2023},
  url       = {https://embodied-ai.org/papers/2023/10.pdf}
}
Exploring the Role of Artificial Intelligence in Facilitating Assessment of Writing Performance in Second Language Learning [Languages]
Zilu Jiang, Zexin Xu, Zilong Pan, Jingwen He, Kui Xie
@Languages
PDF
@article{languages8040247,
  author         = {Jiang, Zilu and Xu, Zexin and Pan, Zilong and He, Jingwen and Xie, Kui},
  title          = {Exploring the Role of Artificial Intelligence in Facilitating Assessment of Writing Performance in Second Language Learning},
  journal        = {Languages},
  volume         = {8},
  number         = {4},
  article-number = {247},
  year           = {2023},
  issn           = {2226-471X},
  doi            = {10.3390/languages8040247},
  url            = {https://www.mdpi.com/2226-471X/8/4/247},
  abstract       = {This study examined the robustness and efficiency of four large language models (LLMs), GPT-4, GPT-3.5, iFLYTEK and Baidu Cloud, in assessing the writing accuracy of the Chinese language. Writing samples were collected from students in an online high school Chinese language learning program in the US. The official APIs of the LLMs were utilized to conduct analyses at both the T-unit and sentence levels. Performance metrics were employed to evaluate the LLMs’ performance. The LLM results were compared to human rating results. Content analysis was conducted to categorize error types and highlight the discrepancies between human and LLM ratings. Additionally, the efficiency of each model was evaluated. The results indicate that GPT models and iFLYTEK achieved similar accuracy scores, with GPT-4 excelling in precision. These findings provide insights into the potential of LLMs in supporting the assessment of writing accuracy for language learners.}
}
2022
Measuring Elementary Students’ Behavioral Engagement in Web-based Science Inquiry Learning [JOLR]
Jingwen He, Bihui Jin, Zexin Xu, Danhui Zhang
@Journal of Online Learning Research
PDF
@article{HeJinXu2022vk,
  author    = {Jingwen He and Bihui Jin and Zexin Xu and Danhui Zhang},
  title     = {Measuring Elementary Students’ Behavioral Engagement in Web-based Science Inquiry Learning},
  journal   = {Journal of Online Learning Research},
  note      = {Special Issue: Exploring (Dis-)Engagement in K-12 Online and Blended Learning},
  volume    = {8},
  number    = {3},
  year      = {2022},
  month     = nov,
  pages     = {289--313},
  address   = {Waynesville, NC USA},
  publisher = {Association for the Advancement of Computing in Education (AACE)},
  issn      = {2374-1473},
  abstract  = {With the development of web-based science inquiry learning, behavioral engagement in such learning contexts received more and more attention. Combined with specific science inquiry stages: comparative experiment design, implementation with computer simulation, and reflection on results, the current study explored a series of features from log data to conceptualize students’ behavioral engagement. The features were divided into three categories: general engagement features including time, game the system, submission frequency, and revisiting behavior; learning content related features including context consistency, comparative experimental design, and experiment design consistency; and instruction related features consisting of revision behavior and revision improvement. 220 sixth graders from four classes in China participated in the study. Correlation and regression analysis were used to analyze the relationship between engagement features and learning performance...},
  url       = {https://www.learntechlib.org/p/221456}
}