@article{Costa-jussà-nllbnature,
  author        = {Costa-jussà, Marta R. and Cross, James and Çelebi, Onur and Elbayad, Maha and Heafield, Kenneth and Heffernan, Kevin and Kalbassi, Elahe and Lam, Janice and Licht, Daniel and Maillard, Jean and Sun, Anna and Wang, Skyler and Wenzek, Guillaume and Youngblood, Al and Akula, Bapi and Barrault, Loic and Gonzalez, Gabriel Mejia and Hansanti, Prangthip and Hoffman, John and Jarrett, Semarley and Sadagopan, Kaushik Ram and Rowe, Dirk and Spruit, Shannon and Tran, Chau and Andrews, Pierre and Ayan, Necip Fazil and Bhosale, Shruti and Edunov, Sergey and Fan, Angela and Gao, Cynthia and Goswami, Vedanuj and Guzmán, Francisco and Koehn, Philipp and Mourachko, Alexandre and Ropers, Christophe and Saleem, Safiyyah and Schwenk, Holger and Wang, Jeff},
  title         = {Scaling neural machine translation to 200 languages},
  journal       = {Nature},
  volume        = {630},
  pages         = {841--846},
  year          = {2024},
  month         = jun,
  month_numeric = {6},
  day           = {5},
  url           = {https://kheafield.com/papers/facebook/nllb\_nature.pdf}
}
@inproceedings{pal-etal-2024-document,
  author        = {Pal, Proyag and Birch, Alexandra and Heafield, Kenneth},
  title         = {Document-Level Machine Translation with Large-Scale Public Parallel Corpora},
  editor        = {Ku, Lun-Wei and Martins, Andre and Srikumar, Vivek},
  booktitle     = {Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  pages         = {13185--13197},
  year          = {2024},
  month         = aug,
  month_numeric = {8},
  address       = {Bangkok, Thailand},
  publisher     = {Association for Computational Linguistics},
  doi           = {10.18653/v1/2024.acl-long.712},
  url           = {https://aclanthology.org/2024.acl-long.712/}
}
@misc{grattafiori2024llama3herdmodels,
  title         = {The Llama 3 Herd of Models},
  author        = {Grattafiori, Aaron and Dubey, Abhimanyu and Jauhri, Abhinav and Pandey, Abhinav and Kadian, Abhishek and Al-Dahle, Ahmad and Letman, Aiesha and Mathur, Akhil and Schelten, Alan and Vaughan, Alex and Yang, Amy and Fan, Angela and Goyal, Anirudh and Hartshorn, Anthony and Yang, Aobo and Mitra, Archi and Sravankumar, Archie and Korenev, Artem and Hinsvark, Arthur and Rao, Arun and Zhang, Aston and Rodriguez, Aurelien and Gregerson, Austen and Spataru, Ava and Roziere, Baptiste and Biron, Bethany and Tang, Binh and Chern, Bobbie and Caucheteux, Charlotte and Nayak, Chaya and Bi, Chloe and Marra, Chris and McConnell, Chris and Keller, Christian and Touret, Christophe and Wu, Chunyang and Wong, Corinne and Ferrer, Cristian Canton and Nikolaidis, Cyrus and Allonsius, Damien and Song, Daniel and Pintz, Danielle and Livshits, Danny and Wyatt, Danny and Esiobu, David and Choudhary, Dhruv and Mahajan, Dhruv and Garcia-Olano, Diego and Perino, Diego and Hupkes, Dieuwke and Lakomkin, Egor and AlBadawy, Ehab and Lobanova, Elina and Dinan, Emily and Smith, Eric Michael and Radenovic, Filip and Guzmán, Francisco and Zhang, Frank and Synnaeve, Gabriel and Lee, Gabrielle and Anderson, Georgia Lewis and Thattai, Govind and Nail, Graeme and Mialon, Gregoire and Pang, Guan and Cucurell, Guillem and Nguyen, Hailey and Korevaar, Hannah and Xu, Hu and Touvron, Hugo and Zarov, Iliyan and Ibarra, Imanol Arrieta and Kloumann, Isabel and Misra, Ishan and Evtimov, Ivan and Zhang, Jack and Copet, Jade and Lee, Jaewon and Geffert, Jan and Vranes, Jana and Park, Jason and Mahadeokar, Jay and Shah, Jeet and van der Linde, Jelmer and Billock, Jennifer and Hong, Jenny and Lee, Jenya and Fu, Jeremy and Chi, Jianfeng and Huang, Jianyu and Liu, Jiawen and Wang, Jie and Yu, Jiecao and Bitton, Joanna and Spisak, Joe and Park, Jongsoo and Rocca, Joseph and Johnstun, Joshua and Saxe, Joshua and Jia, Junteng and Alwala, Kalyan Vasuden and Prasad, Karthik and Upasani, Kartikeya and Plawiak, Kate
                   and Li, Ke and Heafield, Kenneth and Stone, Kevin and El-Arini, Khalid and Iyer, Krithika and Malik, Kshitiz and Chiu, Kuenley and Bhalla, Kunal and Lakhotia, Kushal and Rantala-Yeary, Lauren and van der Maaten, Laurens and Chen, Lawrence and Tan, Liang and Jenkins, Liz and Martin, Louis and Madaan, Lovish and Malo, Lubo and Blecher, Lukas and Landzaat, Lukas and de Oliveira, Luke and Muzzi, Madeline and Pasupuleti, Mahesh and Singh, Mannat and Paluri, Manohar and Kardas, Marcin and Tsimpoukelli, Maria and Oldham, Mathew and Rita, Mathieu and Pavlova, Maya and Kambadur, Melanie and Lewis, Mike and Si, Min and Singh, Mitesh Kumar and Hassan, Mona and Goyal, Naman and Torabi, Narjes and Bashlykov, Nikolay and Bogoychev, Nikolay and Chatterji, Niladri and Zhang, Ning and Duchenne, Olivier and Çelebi, Onur and Alrassy, Patrick and Zhang, Pengchuan and Li, Pengwei and Vasic, Petar and Weng, Peter and Bhargava, Prajjwal and Dubal, Pratik and Krishnan, Praveen and Koura, Punit Singh and Xu, Puxin and He, Qing and Dong, Qingxiao and Srinivasan, Ragavan and Ganapathy, Raj and Calderer, Ramon and Cabral, Ricardo Silveira and Stojnic, Robert and Raileanu, Roberta and Maheswari, Rohan and Girdhar, Rohit and Patel, Rohit and Sauvestre, Romain and Polidoro, Ronnie and Sumbaly, Roshan and Taylor, Ross and Silva, Ruan and Hou, Rui and Wang, Rui and Hosseini, Saghar and Chennabasappa, Sahana and Singh, Sanjay and Bell, Sean and Kim, Seohyun Sonia and Edunov, Sergey and Nie, Shaoliang and Narang, Sharan and Raparthy, Sharath and Shen, Sheng and Wan, Shengye and Bhosale, Shruti and Zhang, Shun and Vandenhende, Simon and Batra, Soumya and Whitman, Spencer and Sootla, Sten and Collot, Stephane and Gururangan, Suchin and Borodinsky, Sydney and Herman, Tamar and Fowler, Tara and Sheasha, Tarek and Georgiou, Thomas and Scialom, Thomas and Speckbacher, Tobias and Mihaylov, Todor and Xiao, Tong and Karn, Ujjwal and Goswami, Vedanuj and Gupta, Vibhor and Ramanathan, Vignesh and Kerkez, Viktor
                   and Gonguet, Vincent and Do, Virginie and Vogeti, Vish and Albiero, Vítor and Petrovic, Vladan and Chu, Weiwei and Xiong, Wenhan and Fu, Wenyin and Meers, Whitney and Martinet, Xavier and Wang, Xiaodong and Wang, Xiaofang and Tan, Xiaoqing Ellen and Xia, Xide and Xie, Xinfeng and Jia, Xuchao and Wang, Xuewei and Goldschlag, Yaelle and Gaur, Yashesh and Babaei, Yasmine and Wen, Yi and Song, Yiwen and Zhang, Yuchen and Li, Yue and Mao, Yuning and Coudert, Zacharie Delpierre and Yan, Zheng and Chen, Zhengxing and Papakipos, Zoe and Singh, Aaditya and Srivastava, Aayushi and Jain, Abha and Kelsey, Adam and Shajnfeld, Adam and Gangidi, Adithya and Victoria, Adolfo and Goldstand, Ahuva and Menon, Ajay and Sharma, Ajay and Boesenberg, Alex and Baevski, Alexei and Feinstein, Allie and Kallet, Amanda and Sangani, Amit and Teo, Amos and Yunus, Anam and Lupu, Andrei and Alvarado, Andres and Caples, Andrew and Gu, Andrew and Ho, Andrew and Poulton, Andrew and Ryan, Andrew and Ramchandani, Ankit and Dong, Annie and Franco, Annie and Goyal, Anuj and Saraf, Aparajita and Chowdhury, Arkabandhu and Gabriel, Ashley and Bharambe, Ashwin and Eisenman, Assaf and Yazdan, Azadeh and James, Beau and Maurer, Ben and Leonhardi, Benjamin and Huang, Bernie and Loyd, Beth and Paola, Beto De and Paranjape, Bhargavi and Liu, Bing and Wu, Bo and Ni, Boyu and Hancock, Braden and Wasti, Bram and Spence, Brandon and Stojkovic, Brani and Gamido, Brian and Montalvo, Britt and Parker, Carl and Burton, Carly and Mejia, Catalina and Liu, Ce and Wang, Changhan and Kim, Changkyu and Zhou, Chao and Hu, Chester and Chu, Ching-Hsiang and Cai, Chris and Tindal, Chris and Feichtenhofer, Christoph and Gao, Cynthia and Civin, Damon and Beaty, Dana and Kreymer, Daniel and Li, Daniel and Adkins, David and Xu, David and Testuggine, Davide and David, Delia and Parikh, Devi and Liskovich, Diana and Foss, Didem and Wang, Dingkang and Le, Duc and Holland, Dustin and Dowling, Edward and Jamil, Eissa
                   and Montgomery, Elaine and Presani, Eleonora and Hahn, Emily and Wood, Emily and Le, Eric-Tuan and Brinkman, Erik and Arcaute, Esteban and Dunbar, Evan and Smothers, Evan and Sun, Fei and Kreuk, Felix and Tian, Feng and Kokkinos, Filippos and Ozgenel, Firat and Caggioni, Francesco and Kanayet, Frank and Seide, Frank and Florez, Gabriela Medina and Schwarz, Gabriella and Badeer, Gada and Swee, Georgia and Halpern, Gil and Herman, Grant and Sizov, Grigory and Zhang, Guangyi and Lakshminarayanan, Guna and Inan, Hakan and Shojanazeri, Hamid and Zou, Han and Wang, Hannah and Zha, Hanwen and Habeeb, Haroun and Rudolph, Harrison and Suk, Helen and Aspegren, Henry and Goldman, Hunter and Zhan, Hongyuan and Damlaj, Ibrahim and Molybog, Igor and Tufanov, Igor and Leontiadis, Ilias and Veliche, Irina-Elena and Gat, Itai and Weissman, Jake and Geboski, James and Kohli, James and Lam, Janice and Asher, Japhet and Gaya, Jean-Baptiste and Marcus, Jeff and Tang, Jeff and Chan, Jennifer and Zhen, Jenny and Reizenstein, Jeremy and Teboul, Jeremy and Zhong, Jessica and Jin, Jian and Yang, Jingyi and Cummings, Joe and Carvill, Jon and Shepard, Jon and McPhie, Jonathan and Torres, Jonathan and Ginsburg, Josh and Wang, Junjie and Wu, Kai and U, Kam Hou and Saxena, Karan and Khandelwal, Kartikay and Zand, Katayoun and Matosich, Kathy and Veeraraghavan, Kaushik and Michelena, Kelly and Li, Keqian and Jagadeesh, Kiran and Huang, Kun and Chawla, Kunal and Huang, Kyle and Chen, Lailin and Garg, Lakshya and A, Lavender and Silva, Leandro and Bell, Lee and Zhang, Lei and Guo, Liangpeng and Yu, Licheng and Moshkovich, Liron and Wehrstedt, Luca and Khabsa, Madian and Avalani, Manav and Bhatt, Manish and Mankus, Martynas and Hasson, Matan and Lennie, Matthew and Reso, Matthias and Groshev, Maxim and Naumov, Maxim and Lathi, Maya and Keneally, Meghan and Liu, Miao and Seltzer, Michael L.
                   and Valko, Michal and Restrepo, Michelle and Patel, Mihir and Vyatskov, Mik and Samvelyan, Mikayel and Clark, Mike and Macey, Mike and Wang, Mike and Hermoso, Miquel Jubert and Metanat, Mo and Rastegari, Mohammad and Bansal, Munish and Santhanam, Nandhini and Parks, Natascha and White, Natasha and Bawa, Navyata and Singhal, Nayan and Egebo, Nick and Usunier, Nicolas and Mehta, Nikhil and Laptev, Nikolay Pavlovich and Dong, Ning and Cheng, Norman and Chernoguz, Oleg and Hart, Olivia and Salpekar, Omkar and Kalinli, Ozlem and Kent, Parkin and Parekh, Parth and Saab, Paul and Balaji, Pavan and Rittner, Pedro and Bontrager, Philip and Roux, Pierre and Dollar, Piotr and Zvyagina, Polina and Ratanchandani, Prashant and Yuvraj, Pritish and Liang, Qian and Alao, Rachad and Rodriguez, Rachel and Ayub, Rafi and Murthy, Raghotham and Nayani, Raghu and Mitra, Rahul and Parthasarathy, Rangaprabhu and Li, Raymond and Hogan, Rebekkah and Battey, Robin and Wang, Rocky and Howes, Russ and Rinott, Ruty and Mehta, Sachin and Siby, Sachin and Bondu, Sai Jayesh and Datta, Samyak and Chugh, Sara and Hunt, Sara and Dhillon, Sargun and Sidorov, Sasha and Pan, Satadru and Mahajan, Saurabh and Verma, Saurabh and Yamamoto, Seiji and Ramaswamy, Sharadh and Lindsay, Shaun and Feng, Sheng and Lin, Shenghao and Zha, Shengxin Cindy and Patil, Shishir and Shankar, Shiva and Zhang, Shuqiang and Wang, Sinong and Agarwal, Sneha and Sajuyigbe, Soji and Chintala, Soumith and Max, Stephanie and Chen, Stephen and Kehoe, Steve and Satterfield, Steve and Govindaprasad, Sudarshan and Gupta, Sumit and Deng, Summer and Cho, Sungmin and Virk, Sunny and Subramanian, Suraj and Choudhury, Sy and Goldman, Sydney and Remez, Tal and Glaser, Tamar and Best, Tamara and Koehler, Thilo and Robinson, Thomas and Li, Tianhe and Zhang, Tianjun and Matthews, Tim and Chou, Timothy and Shaked, Tzook and Vontimitta, Varun and Ajayi, Victoria and Montanez, Victoria and Mohan, Vijai
                   and Kumar, Vinay Satish and Mangla, Vishal and Ionescu, Vlad and Poenaru, Vlad and Mihailescu, Vlad Tiberiu and Ivanov, Vladimir and Li, Wei and Wang, Wenchen and Jiang, Wenwen and Bouaziz, Wes and Constable, Will and Tang, Xiaocheng and Wu, Xiaojian and Wang, Xiaolan and Wu, Xilun and Gao, Xinbo and Kleinman, Yaniv and Chen, Yanjun and Hu, Ye and Jia, Ye and Qi, Ye and Li, Yenda and Zhang, Yilin and Zhang, Ying and Adi, Yossi and Nam, Youngjin and Wang, Yu and Zhao, Yu and Hao, Yuchen and Qian, Yundi and Li, Yunlu and He, Yuzi and Rait, Zach and DeVito, Zachary and Rosnbrick, Zef and Wen, Zhaoduo and Yang, Zhenyu and Zhao, Zhiwei and Ma, Zhiyu},
  year          = {2024},
  eprint        = {2407.21783},
  archiveprefix = {arXiv},
  primaryclass  = {cs.AI},
  url           = {https://arxiv.org/abs/2407.21783}
}
@inproceedings{chen-etal-2024-iterative,
  author        = {Chen, Pinzhen and Guo, Zhicheng and Haddow, Barry and Heafield, Kenneth},
  title         = {Iterative Translation Refinement with Large Language Models},
  editor        = {Scarton, Carolina and Prescott, Charlotte and Bayliss, Chris and Oakley, Chris and Wright, Joanna and Wrigley, Stuart and Song, Xingyi and Gow-Smith, Edward and Bawden, Rachel and S{\'a}nchez-Cartagena, V{\'i}ctor M and Cadwell, Patrick and Lapshinova-Koltunski, Ekaterina and Cabarr{\~a}o, Vera and Chatzitheodorou, Konstantinos and Nurminen, Mary and Kanojia, Diptesh and Moniz, Helena},
  booktitle     = {Proceedings of the 25th Annual Conference of the European Association for Machine Translation (Volume 1)},
  pages         = {181--190},
  year          = {2024},
  month         = jun,
  month_numeric = {6},
  address       = {Sheffield, UK},
  publisher     = {European Association for Machine Translation (EAMT)},
  url           = {https://aclanthology.org/2024.eamt-1.17/}
}
@inproceedings{burchell-etal-2024-code,
  author        = {Burchell, Laurie and Birch, Alexandra and Thompson, Robert and Heafield, Kenneth},
  title         = {Code-Switched Language Identification is Harder Than You Think},
  editor        = {Graham, Yvette and Purver, Matthew},
  booktitle     = {Proceedings of the 18th Conference of the European Chapter of the Association for Computational Linguistics (Volume 1: Long Papers)},
  pages         = {646--658},
  year          = {2024},
  month         = mar,
  month_numeric = {3},
  address       = {St. Julian{'}s, Malta},
  publisher     = {Association for Computational Linguistics},
  doi           = {10.18653/v1/2024.eacl-long.38},
  url           = {https://aclanthology.org/2024.eacl-long.38/}
}
@inproceedings{chen-etal-2024-monolingual,
  author        = {Chen, Pinzhen and Ji, Shaoxiong and Bogoychev, Nikolay and Kutuzov, Andrey and Haddow, Barry and Heafield, Kenneth},
  title         = {Monolingual or Multilingual Instruction Tuning: Which Makes a Better Alpaca},
  editor        = {Graham, Yvette and Purver, Matthew},
  booktitle     = {Findings of the Association for Computational Linguistics: EACL 2024},
  pages         = {1347--1356},
  year          = {2024},
  month         = mar,
  month_numeric = {3},
  address       = {St. Julian{'}s, Malta},
  publisher     = {Association for Computational Linguistics},
  url           = {https://aclanthology.org/2024.findings-eacl.90/}
}
@inproceedings{burchell-etal-2023-open,
  author        = {Burchell, Laurie and Birch, Alexandra and Bogoychev, Nikolay and Heafield, Kenneth},
  title         = {An Open Dataset and Model for Language Identification},
  editor        = {Rogers, Anna and Boyd-Graber, Jordan and Okazaki, Naoaki},
  booktitle     = {Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)},
  pages         = {865--879},
  year          = {2023},
  month         = jul,
  month_numeric = {7},
  address       = {Toronto, Canada},
  publisher     = {Association for Computational Linguistics},
  doi           = {10.18653/v1/2023.acl-short.75},
  url           = {https://aclanthology.org/2023.acl-short.75/},
  abstract      = {Language identification (LID) is a fundamental step in many natural language processing pipelines. However, current LID systems are far from perfect, particularly on lower-resource languages. We present a LID model which achieves a macro-average F1 score of 0.93 and a false positive rate of 0.033{\%} across 201 languages, outperforming previous work. We achieve this by training on a curated dataset of monolingual data, which we audit manually to ensure reliability. We make both the model and the dataset available to the research community. Finally, we carry out detailed analysis into our model{'}s performance, both in comparison to existing open models and by language class.}
}
@inproceedings{pal-heafield-2023-cheating,
  author        = {Pal, Proyag and Heafield, Kenneth},
  title         = {Cheating to Identify Hard Problems for Neural Machine Translation},
  booktitle     = {Findings of the Association for Computational Linguistics: EACL 2023},
  pages         = {1620--1631},
  year          = {2023},
  month         = may,
  month_numeric = {5},
  address       = {Dubrovnik, Croatia},
  publisher     = {Association for Computational Linguistics},
  url           = {https://aclanthology.org/2023.findings-eacl.120},
  abstract      = {We identify hard problems for neural machine translation models by analyzing progressively higher-scoring translations generated by letting models cheat to various degrees. If a system cheats and still gets something wrong, that suggests it is a hard problem. We experiment with two forms of cheating: providing the model a compressed representation of the target as an additional input, and fine-tuning on the test set. Contrary to popular belief, we find that the most frequent tokens are not necessarily the most accurately translated due to these often being function words and punctuation that can be used more flexibly in translation, or content words which can easily be paraphrased. We systematically analyze system outputs to identify categories of tokens which are particularly hard for the model to translate, and find that this includes certain types of named entities, subordinating conjunctions, and unknown and foreign words. We also encounter a phenomenon where words, often names, which were not infrequent in the training data are still repeatedly mistranslated by the models {---} we dub this the Fleetwood Mac problem.}
}
@article{Treviso-survey,
  author        = {Treviso, Marcos and Lee, Ji-Ung and Ji, Tianchu and van Aken, Betty and Cao, Qingqing and Ciosici, Manuel R. and Hassid, Michael and Heafield, Kenneth and Hooker, Sara and Raffel, Colin and Martins, Pedro H. and Martins, André F. T. and Forde, Jessica Zosa and Milder, Peter and Simpson, Edwin and Slonim, Noam and Dodge, Jesse and Strubell, Emma and Balasubramanian, Niranjan and Derczynski, Leon and Gurevych, Iryna and Schwartz, Roy},
  title         = {Efficient Methods for Natural Language Processing: A Survey},
  journal       = {Transactions of the Association for Computational Linguistics},
  volume        = {11},
  year          = {2023},
  month         = mar,
  month_numeric = {3},
  day           = {18},
  url           = {https://kheafield.com/papers/edinburgh/survey.pdf}
}
@inproceedings{bogoychev-etal-2022-edinburghs,
  author        = {Bogoychev, Nikolay and Behnke, Maximiliana and van der Linde, Jelmer and Nail, Graeme and Heafield, Kenneth and Zhang, Biao and Kashyap, Sidharth},
  title         = {{Edinburgh's} Submission to the {WMT} 2022 Efficiency Task},
  booktitle     = {Proceedings of the Seventh Conference on Machine Translation (WMT)},
  pages         = {661--667},
  year          = {2022},
  month         = dec,
  month_numeric = {12},
  address       = {Abu Dhabi, United Arab Emirates (Hybrid)},
  publisher     = {Association for Computational Linguistics},
  url           = {https://aclanthology.org/2022.wmt-1.63},
  abstract      = {We participated in all tracks of the WMT 2022 efficient machine translation task: single-core CPU, multi-core CPU, and GPU hardware with throughput and latency conditions. Our submissions explores a number of several efficiency strategies: knowledge distillation, a simpler simple recurrent unit (SSRU) decoder with one or two layers, shortlisting, deep encoder, shallow decoder, pruning and bidirectional decoder. For the CPU track, we used quantized 8-bit models. For the GPU track, we used FP16 quantisation. We explored various pruning strategies and combination of one or more of the above methods.}
}
@inproceedings{heafield-etal-2022-findings,
  author        = {Heafield, Kenneth and Zhang, Biao and Nail, Graeme and van der Linde, Jelmer and Bogoychev, Nikolay},
  title         = {Findings of the {WMT} 2022 Shared Task on Efficient Translation},
  booktitle     = {Proceedings of the Seventh Conference on Machine Translation (WMT)},
  pages         = {100--108},
  year          = {2022},
  month         = dec,
  month_numeric = {12},
  address       = {Abu Dhabi, United Arab Emirates (Hybrid)},
  publisher     = {Association for Computational Linguistics},
  url           = {https://aclanthology.org/2022.wmt-1.4},
  abstract      = {The machine translation efficiency task challenges participants to make their systems faster and smaller with minimal impact on translation quality. How much quality to sacrifice for efficiency depends upon the application, so participants were encouraged to make multiple submissions covering the space of trade-offs. In total, there were 76 submissions from 5 teams. The task covers GPU, single-core CPU, and multi-core CPU hardware tracks as well as batched throughput or single-sentence latency conditions. Submissions showed hundreds of millions of words can be translated for a dollar, average latency is 3.5{--}25 ms, and models fit in 7.5{--}900 MB.}
}
@inproceedings{chen-heafield-2022-approaching,
  author        = {Chen, Pinzhen and Heafield, Kenneth},
  title         = {Approaching Neural {Chinese} Word Segmentation as a Low-Resource Machine Translation Task},
  booktitle     = {Proceedings of the 36th Pacific Asia Conference on Language, Information and Computation},
  pages         = {600--606},
  year          = {2022},
  month         = oct,
  month_numeric = {10},
  address       = {Manila, Philippines},
  publisher     = {De La Salle University},
  url           = {https://aclanthology.org/2022.paclic-1.66}
}
@inproceedings{turcan-etal-2022-constrained,
  author        = {Turcan, Elsbeth and Wan, David and Ladhak, Faisal and Galuscakova, Petra and Sen, Sukanta and Tchistiakova, Svetlana and Xu, Weijia and Carpuat, Marine and Heafield, Kenneth and Oard, Douglas and McKeown, Kathleen},
  title         = {Constrained Regeneration for Cross-Lingual Query-Focused Extractive Summarization},
  booktitle     = {Proceedings of the 29th International Conference on Computational Linguistics},
  pages         = {2668--2680},
  year          = {2022},
  month         = oct,
  month_numeric = {10},
  address       = {Gyeongju, Republic of Korea},
  publisher     = {International Committee on Computational Linguistics},
  url           = {https://aclanthology.org/2022.coling-1.236},
  abstract      = {Query-focused summaries of foreign-language, retrieved documents can help a user understand whether a document is actually relevant to the query term. A standard approach to this problem is to first translate the source documents and then perform extractive summarization to find relevant snippets. However, in a cross-lingual setting, the query term does not necessarily appear in the translations of relevant documents. In this work, we show that constrained machine translation and constrained post-editing can improve human relevance judgments by including a query term in a summary when its translation appears in the source document. We also present several strategies for selecting only certain documents for regeneration which yield further improvements}
}
@misc{nllbteam2022languageleftbehindscaling,
  author        = {{NLLB Team} and Costa-jussà, Marta R. and Cross, James and Çelebi, Onur and Elbayad, Maha and Heafield, Kenneth and Heffernan, Kevin and Kalbassi, Elahe and Lam, Janice and Licht, Daniel and Maillard, Jean and Sun, Anna and Wang, Skyler and Wenzek, Guillaume and Youngblood, Al and Akula, Bapi and Barrault, Loic and Gonzalez, Gabriel Mejia and Hansanti, Prangthip and Hoffman, John and Jarrett, Semarley and Sadagopan, Kaushik Ram and Rowe, Dirk and Spruit, Shannon and Tran, Chau and Andrews, Pierre and Ayan, Necip Fazil and Bhosale, Shruti and Edunov, Sergey and Fan, Angela and Gao, Cynthia and Goswami, Vedanuj and Guzmán, Francisco and Koehn, Philipp and Mourachko, Alexandre and Ropers, Christophe and Saleem, Safiyyah and Schwenk, Holger and Wang, Jeff},
  title         = {No Language Left Behind: Scaling Human-Centered Machine Translation},
  year          = {2022},
  eprint        = {2207.04672},
  archiveprefix = {arXiv},
  primaryclass  = {cs.CL},
  url           = {https://arxiv.org/abs/2207.04672}
}
@inproceedings{pal-heafield-2022-cheat,
  author        = {Pal, Proyag and Heafield, Kenneth},
  title         = {Cheat Codes to Quantify Missing Source Information in Neural Machine Translation},
  booktitle     = {Proceedings of the 2022 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies},
  pages         = {2472--2477},
  year          = {2022},
  month         = jul,
  month_numeric = {7},
  address       = {Seattle, United States},
  publisher     = {Association for Computational Linguistics},
  doi           = {10.18653/v1/2022.naacl-main.177},
  url           = {https://aclanthology.org/2022.naacl-main.177},
  abstract      = {This paper describes a method to quantify the amount of information $H(t|s)$ added by the target sentence $t$ that is not present in the source $s$ in a neural machine translation system. We do this by providing the model the target sentence in a highly compressed form (a {``}cheat code{''}), and exploring the effect of the size of the cheat code. We find that the model is able to capture extra information from just a single float representation of the target and nearly reproduces the target with two 32-bit floats per target token.}
}
@inproceedings{burchell-birch-and-kenneth-heafield-2022-exploring,
  author        = {Burchell, Laurie and Birch, Alexandra and Heafield, Kenneth},
  title         = {Exploring diversity in back translation for low-resource machine translation},
  booktitle     = {Proceedings of the Third Workshop on Deep Learning for Low-Resource Natural Language Processing},
  pages         = {67--79},
  year          = {2022},
  month         = jul,
  month_numeric = {7},
  address       = {Hybrid},
  publisher     = {Association for Computational Linguistics},
  doi           = {10.18653/v1/2022.deeplo-1.8},
  url           = {https://aclanthology.org/2022.deeplo-1.8}
}
@inproceedings{Heafield-europat,
  author        = {Heafield, Kenneth and Farrow, Elaine and van der Linde, Jelmer and Ramírez-Sánchez, Gema and Wiggins, Dion},
  title         = {The {EuroPat} Corpus: A Parallel Corpus of European Patent Data},
  booktitle     = {Proceedings of the 13th Language Resources and Evaluation Conference},
  year          = {2022},
  month         = jun,
  month_numeric = {6},
  address       = {Marseille, France},
  url           = {https://kheafield.com/papers/edinburgh/europat.pdf}
}
@inproceedings{Bogoychev-translatelocally,
  author        = {Bogoychev, Nikolay and van der Linde, Jelmer and Heafield, Kenneth},
  title         = {{TranslateLocally:} Blazing-fast translation running on the local {CPU}},
  booktitle     = {Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing},
  year          = {2021},
  month         = nov,
  month_numeric = {11},
  address       = {Punta Cana, Dominican Republic},
  url           = {https://kheafield.com/papers/edinburgh/translatelocally.pdf}
}
@inproceedings{akhbardeh-EtAl:2021:WMT,
  author        = {Akhbardeh, Farhad and Arkhangorodsky, Arkady and Biesialska, Magdalena and Bojar, Ondřej and Chatterjee, Rajen and Chaudhary, Vishrav and Costa-jussà, Marta R. and España-Bonet, Cristina and Fan, Angela and Federmann, Christian and Freitag, Markus and Graham, Yvette and Grundkiewicz, Roman and Haddow, Barry and Harter, Leonie and Heafield, Kenneth and Homan, Christopher and Huck, Matthias and Amponsah-Kaakyire, Kwabena and Kasai, Jungo and Khashabi, Daniel and Knight, Kevin and Kocmi, Tom and Koehn, Philipp and Lourie, Nicholas and Monz, Christof and Morishita, Makoto and Nagata, Masaaki and Nagesh, Ajay and Nakazawa, Toshiaki and Negri, Matteo and Pal, Santanu and Tapo, Allahsera Auguste and Turchi, Marco and Vydrin, Valentin and Zampieri, Marcos},
  title         = {Findings of the 2021 Conference on Machine Translation (WMT21)},
  booktitle     = {Proceedings of the Sixth Conference on Machine Translation},
  pages         = {1--88},
  year          = {2021},
  month         = nov,
  month_numeric = {11},
  address       = {Online},
  publisher     = {Association for Computational Linguistics},
  url           = {https://aclanthology.org/2021.wmt-1.1},
  abstract      = {This paper presents the results of the news translation task, the multilingual low-resource translation for Indo-European languages, the triangular translation task, and the automatic post-editing task organised as part of the Conference on Machine Translation (WMT) 2021. In the news task, participants were asked to build machine translation systems for any of 10 language pairs, to be evaluated on test sets consisting mainly of news stories. The task was also opened up to additional test suites to probe specific aspects of translation.}
}
@inproceedings{Heafield-wmt21-speedtask,
  author        = {Heafield, Kenneth and Zhu, Qianqian and Grundkiewicz, Roman},
  title         = {Findings of the {WMT} 2021 Shared Task on Efficient Translation},
  booktitle     = {Proceedings of the Conference on Machine Translation at the 2021 Conference on Empirical Methods in Natural Language Processing},
  year          = {2021},
  month         = nov,
  month_numeric = {11},
  address       = {Punta Cana, Dominican Republic},
  url           = {https://kheafield.com/papers/edinburgh/wmt21-speedtask.pdf}
}
@inproceedings{Behnke-wmt21-speed,
  author        = {Behnke, Maximiliana and Bogoychev, Nikolay and Aji, Alham Fikri and Heafield, Kenneth and Nail, Graeme and Zhu, Qianqian and Tchistiakova, Svetlana and van der Linde, Jelmer and Chen, Pinzhen and Kashyap, Sidharth and Grundkiewicz, Roman},
  title         = {Efficient Machine Translation with Model Pruning and Quantization},
  booktitle     = {Proceedings of the Conference on Machine Translation at the 2021 Conference on Empirical Methods in Natural Language Processing},
  year          = {2021},
  month         = nov,
  month_numeric = {11},
  address       = {Punta Cana, Dominican Republic},
  url           = {https://kheafield.com/papers/edinburgh/wmt21-speed.pdf}
}
@inproceedings{Chen-wmt21-news,
  author        = {Chen, Pinzhen and Helcl, Jindřich and Germann, Ulrich and Burchell, Laurie and Bogoychev, Nikolay and Miceli Barone, Antonio Valerio and Waldendorf, Jonas and Birch, Alexandra and Heafield, Kenneth},
  title         = {The University of {Edinburgh's} English-German and English-Hausa Submissions to the {WMT21} News Translation Task},
  booktitle     = {Proceedings of the Conference on Machine Translation at the 2021 Conference on Empirical Methods in Natural Language Processing},
  year          = {2021},
  month         = nov,
  month_numeric = {11},
  address       = {Punta Cana, Dominican Republic},
  url           = {https://kheafield.com/papers/edinburgh/wmt21-news.pdf}
}
@inproceedings{behnke-heafield:2021:WMT,
  author        = {Behnke, Maximiliana and Heafield, Kenneth},
  title         = {Pruning Neural Machine Translation for Speed Using Group Lasso},
  booktitle     = {Proceedings of the Sixth Conference on Machine Translation},
  pages         = {1074--1086},
  year          = {2021},
  month         = nov,
  month_numeric = {11},
  address       = {Online},
  publisher     = {Association for Computational Linguistics},
  url           = {https://aclanthology.org/2021.wmt-1.116},
  abstract      = {Unlike most work on pruning neural networks, we make inference faster. Group lasso regularisation enables pruning entire rows, columns or blocks of parameters that result in a smaller dense network. Because the network is still dense, efficient matrix multiply routines are still used and only minimal software changes are required to support variable layer sizes. Moreover, pruning is applied during training so there is no separate pruning step. Experiments on top of English->German models, which already have state-of-the-art speed and size, show that two-thirds of feedforward connections can be removed with 0.2 BLEU loss. With 6 decoder layers, the pruned model is 34\% faster; with 2 tied decoder layers, the pruned model is 14\% faster. Pruning entire heads and feedforward connections in a 12–1 encoder-decoder architecture gains an additional 51\% speed-up. These push the Pareto frontier with respect to the trade-off between time and quality compared to strong baselines. In the WMT 2021 Efficiency Task, our pruned and quantised models are 1.9–2.7x faster at the cost 0.9–1.7 BLEU in comparison to the unoptimised baselines. Across language pairs, we see similar sparsity patterns: an ascending or U-shaped distribution in encoder feedforward and attention layers and an ascending distribution in the decoder.}
}
@inproceedings{renduchintala-etal-2021-gender,
  author = {Renduchintala, Adithya and Diaz, Denise and Heafield, Kenneth and Li, Xian and Diab, Mona},
  title = {Gender bias amplification during Speed-Quality optimization in Neural Machine Translation},
  booktitle = {Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 2: Short Papers)},
  year = {2021},
  month = aug,
  month_numeric = {8},
  address = {Online},
  publisher = {Association for Computational Linguistics},
  pages = {99--109},
  doi = {10.18653/v1/2021.acl-short.15},
  url = {https://aclanthology.org/2021.acl-short.15},
  abstract = {Is bias amplified when neural machine translation (NMT) models are optimized for speed and evaluated on generic test sets using BLEU? We investigate architectures and techniques commonly used to speed up decoding in Transformer-based models, such as greedy search, quantization, average attention networks (AANs) and shallow decoder models and show their effect on gendered noun translation. We construct a new gender bias test set, SimpleGEN, based on gendered noun phrases in which there is a single, unambiguous, correct answer. While we find minimal overall BLEU degradation as we apply speed optimizations, we observe that gendered noun translation performance degrades at a much faster rate.}
}
@inproceedings{germann-EtAl:2020:WMT,
  author = {Germann, Ulrich and Grundkiewicz, Roman and Popel, Martin and Dobreva, Radina and Bogoychev, Nikolay and Heafield, Kenneth},
  title = {Speed-optimized, Compact Student Models that Distill Knowledge from a Larger Teacher Model: the {UEDIN-CUNI} Submission to the {WMT} 2020 News Translation Task},
  booktitle = {Proceedings of the Fifth Conference on Machine Translation},
  month = nov,
  year = {2020},
  address = {Online},
  publisher = {Association for Computational Linguistics},
  pages = {190--195},
  abstract = {We describe the joint submission of the University of Edinburgh and Charles University, Prague, to the Czech/English track in the WMT 2020 Shared Task on News Translation. Our fast and compact student models distill knowledge from a larger, slower teacher. They are designed to offer a good trade-off between translation quality and inference efficiency. On the WMT 2020 Czech ↔ English test sets, they achieve translation speeds of over 700 whitespace-delimited source words per second on a single CPU thread, thus making neural translation feasible on consumer hardware without a GPU.},
  url = {https://aclanthology.org/2020.wmt-1.17},
  month_numeric = {11}
}
@inproceedings{behnke-heafield-2020-losing,
  author = {Behnke, Maximiliana and Heafield, Kenneth},
  title = {Losing Heads in the Lottery: Pruning Transformer Attention in Neural Machine Translation},
  booktitle = {Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)},
  year = {2020},
  month = nov,
  month_numeric = {11},
  address = {Online},
  publisher = {Association for Computational Linguistics},
  pages = {2664--2674},
  doi = {10.18653/v1/2020.emnlp-main.211},
  url = {https://aclanthology.org/2020.emnlp-main.211},
  abstract = {The attention mechanism is the crucial component of the transformer architecture. Recent research shows that most attention heads are not confident in their decisions and can be pruned. However, removing them before training a model results in lower quality. In this paper, we apply the lottery ticket hypothesis to prune heads in the early stages of training. Our experiments on machine translation show that it is possible to remove up to three-quarters of attention heads from transformer-big during early training with an average -0.1 change in BLEU for Turkish→English. The pruned model is 1.5 times as fast at inference, albeit at the cost of longer training. Our method is complementary to other approaches, such as teacher-student, with English→German student model gaining an additional 10{\%} speed-up with 75{\%} encoder attention removed and 0.2 BLEU loss.}
}
@inproceedings{domhan-etal-2020-sockeye,
  title = {The Sockeye 2 Neural Machine Translation Toolkit at {AMTA} 2020},
  author = {Domhan, Tobias and Denkowski, Michael and Vilar, David and Niu, Xing and Hieber, Felix and Heafield, Kenneth},
  booktitle = {Proceedings of the 14th Conference of the Association for Machine Translation in the Americas (AMTA 2020)},
  month = oct,
  year = {2020},
  address = {Virtual},
  publisher = {Association for Machine Translation in the Americas},
  url = {https://aclanthology.org/2020.amta-research.10},
  pages = {110--115},
  month_numeric = {10}
}
@inproceedings{aji-heafield-2020-compressing,
  title = {Compressing Neural Machine Translation Models with 4-bit Precision},
  author = {Aji, Alham Fikri and Heafield, Kenneth},
  booktitle = {Proceedings of the Fourth Workshop on Neural Generation and Translation},
  month = jul,
  year = {2020},
  address = {Online},
  publisher = {Association for Computational Linguistics},
  url = {https://aclanthology.org/2020.ngt-1.4},
  pages = {35--42},
  abstract = {Neural Machine Translation (NMT) is resource-intensive. We design a quantization procedure to compress fit NMT models better for devices with limited hardware capability. We use logarithmic quantization, instead of the more commonly used fixed-point quantization, based on the empirical fact that parameters distribution is not uniform. We find that biases do not take a lot of memory and show that biases can be left uncompressed to improve the overall quality without affecting the compression rate. We also propose to use an error-feedback mechanism during retraining, to preserve the compressed model as a stale gradient. We empirically show that NMT models based on Transformer or RNN architecture can be compressed up to 4-bit precision without any noticeable quality degradation. Models can be compressed up to binary precision, albeit with lower quality. RNN architecture seems to be more robust towards compression, compared to the Transformer.},
  month_numeric = {7}
}
@inproceedings{bogoychev-etal-2020-edinburghs,
  title = {{E}dinburgh{'}s Submissions to the 2020 Machine Translation Efficiency Task},
  author = {Bogoychev, Nikolay and Grundkiewicz, Roman and Aji, Alham Fikri and Behnke, Maximiliana and Heafield, Kenneth and Kashyap, Sidharth and Farsarakis, Emmanouil-Ioannis and Chudyk, Mateusz},
  booktitle = {Proceedings of the Fourth Workshop on Neural Generation and Translation},
  month = jul,
  year = {2020},
  address = {Online},
  publisher = {Association for Computational Linguistics},
  url = {https://aclanthology.org/2020.ngt-1.26},
  pages = {218--224},
  abstract = {We participated in all tracks of the Workshop on Neural Generation and Translation 2020 Efficiency Shared Task: single-core CPU, multi-core CPU, and GPU. At the model level, we use teacher-student training with a variety of student sizes, tie embeddings and sometimes layers, use the Simpler Simple Recurrent Unit, and introduce head pruning. On GPUs, we used 16-bit floating-point tensor cores. On CPUs, we customized 8-bit quantization and multiple processes with affinity for the multi-core setting. To reduce model size, we experimented with 4-bit log quantization but use floats at runtime. In the shared task, most of our submissions were Pareto optimal with respect the trade-off between time and quality.},
  month_numeric = {7}
}
@inproceedings{heafield-etal-2020-findings,
  title = {Findings of the Fourth Workshop on Neural Generation and Translation},
  author = {Heafield, Kenneth and Hayashi, Hiroaki and Oda, Yusuke and Konstas, Ioannis and Finch, Andrew and Neubig, Graham and Li, Xian and Birch, Alexandra},
  booktitle = {Proceedings of the Fourth Workshop on Neural Generation and Translation},
  month = jul,
  year = {2020},
  address = {Online},
  publisher = {Association for Computational Linguistics},
  url = {https://aclanthology.org/2020.ngt-1.1},
  pages = {1--9},
  abstract = {We describe the finding of the Fourth Workshop on Neural Generation and Translation, held in concert with the annual conference of the Association for Computational Linguistics (ACL 2020). First, we summarize the research trends of papers presented in the proceedings. Second, we describe the results of the three shared tasks 1) efficient neural machine translation (NMT) where participants were tasked with creating NMT systems that are both accurate and efficient, and 2) document-level generation and translation (DGT) where participants were tasked with developing systems that generate summaries from structured data, potentially with assistance from text in another language and 3) STAPLE task: creation of as many possible translations of a given input text. This last shared task was organised by Duolingo.},
  month_numeric = {7}
}
@inproceedings{aji-etal-2020-neural,
  title = {In Neural Machine Translation, What Does Transfer Learning Transfer?},
  author = {Aji, Alham Fikri and Bogoychev, Nikolay and Heafield, Kenneth and Sennrich, Rico},
  booktitle = {Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics},
  month = jul,
  year = {2020},
  address = {Online},
  publisher = {Association for Computational Linguistics},
  url = {https://aclanthology.org/2020.acl-main.688},
  pages = {7701--7710},
  abstract = {Transfer learning improves quality for low-resource machine translation, but it is unclear what exactly it transfers. We perform several ablation studies that limit information transfer, then measure the quality impact across three language pairs to gain a black-box understanding of transfer learning. Word embeddings play an important role in transfer learning, particularly if they are properly aligned. Although transfer learning can be performed without embeddings, results are sub-optimal. In contrast, transferring only the embeddings but nothing else yields catastrophic results. We then investigate diagonal alignments with auto-encoders over real languages and randomly generated sequences, finding even randomly generated sequences as parents yield noticeable but smaller gains. Finally, transfer learning can eliminate the need for a warm-up phase when training transformer models in high resource language pairs.},
  month_numeric = {7}
}
@inproceedings{chen-etal-2020-parallel,
  title = {Parallel Sentence Mining by Constrained Decoding},
  author = {Chen, Pinzhen and Bogoychev, Nikolay and Heafield, Kenneth and Kirefu, Faheem},
  booktitle = {Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics},
  month = jul,
  year = {2020},
  address = {Online},
  publisher = {Association for Computational Linguistics},
  url = {https://aclanthology.org/2020.acl-main.152},
  pages = {1672--1678},
  abstract = {We present a novel method to extract parallel sentences from two monolingual corpora, using neural machine translation. Our method relies on translating sentences in one corpus, but constraining the decoding by a prefix tree built on the other corpus. We argue that a neural machine translation system by itself can be a sentence similarity scorer and it efficiently approximates pairwise comparison with a modified beam search. When benchmarked on the BUCC shared task, our method achieves results comparable to other submissions.},
  month_numeric = {7}
}
@inproceedings{banon-etal-2020-paracrawl,
  title = {{P}ara{C}rawl: Web-Scale Acquisition of Parallel Corpora},
  author = {Ba{\~n}{\'o}n, Marta and Chen, Pinzhen and Haddow, Barry and Heafield, Kenneth and Hoang, Hieu and Espl{\`a}-Gomis, Miquel and Forcada, Mikel L. and Kamran, Amir and Kirefu, Faheem and Koehn, Philipp and Ortiz Rojas, Sergio and Pla Sempere, Leopoldo and Ram{\'\i}rez-S{\'a}nchez, Gema and Sarr{\'\i}as, Elsa and Strelec, Marek and Thompson, Brian and Waites, William and Wiggins, Dion and Zaragoza, Jaume},
  booktitle = {Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics},
  month = jul,
  year = {2020},
  address = {Online},
  publisher = {Association for Computational Linguistics},
  url = {https://aclanthology.org/2020.acl-main.417},
  pages = {4555--4567},
  abstract = {We report on methods to create the largest publicly available parallel corpora by crawling the web, using open source software. We empirically compare alternative methods and publish benchmark data sets for sentence alignment and sentence pair filtering. We also describe the parallel corpora released and evaluate their quality and their usefulness to create machine translation systems.},
  month_numeric = {7}
}
@inproceedings{kim-etal-2019-research,
  title = {From Research to Production and Back: Ludicrously Fast Neural Machine Translation},
  author = {Kim, Young Jin and Junczys-Dowmunt, Marcin and Hassan, Hany and Aji, Alham Fikri and Heafield, Kenneth and Grundkiewicz, Roman and Bogoychev, Nikolay},
  booktitle = {Proceedings of the 3rd Workshop on Neural Generation and Translation},
  month = nov,
  year = {2019},
  address = {Hong Kong},
  publisher = {Association for Computational Linguistics},
  url = {https://aclanthology.org/D19-5632},
  doi = {10.18653/v1/D19-5632},
  pages = {280--288},
  abstract = {This paper describes the submissions of the {``}Marian{''} team to the WNGT 2019 efficiency shared task. Taking our dominating submissions to the previous edition of the shared task as a starting point, we develop improved teacher-student training via multi-agent dual-learning and noisy backward-forward translation for Transformer-based student models. For efficient CPU-based decoding, we propose pre-packed 8-bit matrix products, improved batched decoding, cache-friendly student architectures with parameter sharing and light-weight RNN-based decoder architectures. GPU-based decoding benefits from the same architecture changes, from pervasive 16-bit inference and concurrent streams. These modifications together with profiler-based C++ code optimization allow us to push the Pareto frontier established during the 2018 edition towards 24x (CPU) and 14x (GPU) faster models at comparable or higher BLEU values. Our fastest CPU model is more than 4x faster than last year{'}s fastest submission at more than 3 points higher BLEU. Our fastest GPU model at 1.5 seconds translation time is slightly faster than last year{'}s fastest RNN-based submissions, but outperforms them by more than 4 BLEU and 10 BLEU points respectively.},
  month_numeric = {11}
}
@inproceedings{currey-heafield-2019-zero,
  title = {Zero-Resource Neural Machine Translation with Monolingual Pivot Data},
  author = {Currey, Anna and Heafield, Kenneth},
  booktitle = {Proceedings of the 3rd Workshop on Neural Generation and Translation},
  month = nov,
  year = {2019},
  address = {Hong Kong},
  publisher = {Association for Computational Linguistics},
  url = {https://aclanthology.org/D19-5610},
  doi = {10.18653/v1/D19-5610},
  pages = {99--107},
  abstract = {Zero-shot neural machine translation (NMT) is a framework that uses source-pivot and target-pivot parallel data to train a source-target NMT system. An extension to zero-shot NMT is zero-resource NMT, which generates pseudo-parallel corpora using a zero-shot system and further trains the zero-shot system on that data. In this paper, we expand on zero-resource NMT by incorporating monolingual data in the pivot language into training; since the pivot language is usually the highest-resource language of the three, we expect monolingual pivot-language data to be most abundant. We propose methods for generating pseudo-parallel corpora using pivot-language monolingual data and for leveraging the pseudo-parallel corpora to improve the zero-shot NMT system. We evaluate these methods for a high-resource language pair (German-Russian) using English as the pivot. We show that our proposed methods yield consistent improvements over strong zero-shot and zero-resource baselines and even catch up to pivot-based models in BLEU (while not requiring the two-pass inference that pivot models require).},
  month_numeric = {11}
}
@inproceedings{aji-heafield-2019-making,
  title = {Making Asynchronous Stochastic Gradient Descent Work for Transformers},
  author = {Aji, Alham Fikri and Heafield, Kenneth},
  booktitle = {Proceedings of the 3rd Workshop on Neural Generation and Translation},
  month = nov,
  year = {2019},
  address = {Hong Kong},
  publisher = {Association for Computational Linguistics},
  url = {https://aclanthology.org/D19-5608},
  doi = {10.18653/v1/D19-5608},
  pages = {80--89},
  abstract = {Asynchronous stochastic gradient descent (SGD) converges poorly for Transformer models, so synchronous SGD has become the norm for Transformer training. This is unfortunate because asynchronous SGD is faster at raw training speed since it avoids waiting for synchronization. Moreover, the Transformer model is the basis for state-of-the-art models for several tasks, including machine translation, so training speed matters. To understand why asynchronous SGD under-performs, we blur the lines between asynchronous and synchronous methods. We find that summing several asynchronous updates, rather than applying them immediately, restores convergence behavior. With this method, the Transformer attains the same BLEU score 1.36 times as fast.},
  month_numeric = {11}
}
@inproceedings{aji-etal-2019-combining,
  title = {Combining Global Sparse Gradients with Local Gradients in Distributed Neural Network Training},
  author = {Aji, Alham Fikri and Heafield, Kenneth and Bogoychev, Nikolay},
  booktitle = {Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP)},
  month = nov,
  year = {2019},
  address = {Hong Kong, China},
  publisher = {Association for Computational Linguistics},
  url = {https://aclanthology.org/D19-1373},
  doi = {10.18653/v1/D19-1373},
  pages = {3626--3631},
  abstract = {One way to reduce network traffic in multi-node data-parallel stochastic gradient descent is to only exchange the largest gradients. However, doing so damages the gradient and degrades the model{'}s performance. Transformer models degrade dramatically while the impact on RNNs is smaller. We restore gradient quality by combining the compressed global gradient with the node{'}s locally computed uncompressed gradient. Neural machine translation experiments show that Transformer convergence is restored while RNNs converge faster. With our method, training on 4 nodes converges up to 1.5x as fast as with uncompressed gradients and scales 3.5x relative to single-node training.},
  month_numeric = {11}
}
@inproceedings{currey-heafield-2019-incorporating,
  title = {Incorporating Source Syntax into Transformer-Based Neural Machine Translation},
  author = {Currey, Anna and Heafield, Kenneth},
  booktitle = {Proceedings of the Fourth Conference on Machine Translation (Volume 1: Research Papers)},
  month = aug,
  year = {2019},
  address = {Florence, Italy},
  publisher = {Association for Computational Linguistics},
  url = {https://aclanthology.org/W19-5203},
  doi = {10.18653/v1/W19-5203},
  pages = {24--33},
  abstract = {Transformer-based neural machine translation (NMT) has recently achieved state-of-the-art performance on many machine translation tasks. However, recent work (Raganato and Tiedemann, 2018; Tang et al., 2018; Tran et al., 2018) has indicated that Transformer models may not learn syntactic structures as well as their recurrent neural network-based counterparts, particularly in low-resource cases. In this paper, we incorporate constituency parse information into a Transformer NMT model. We leverage linearized parses of the source training sentences in order to inject syntax into the Transformer architecture without modifying it. We introduce two methods: a multi-task machine translation and parsing model with a single encoder and decoder, and a mixed encoder model that learns to translate directly from parsed and unparsed source sentences. We evaluate our methods on low-resource translation from English into twenty target languages, showing consistent improvements of 1.3 BLEU on average across diverse target languages for the multi-task technique. We further evaluate the models on full-scale WMT tasks, finding that the multi-task model aids low- and medium-resource NMT but degenerates high-resource English-German translation.},
  month_numeric = {8}
}
@inproceedings{grundkiewicz-etal-2019-neural,
  title = {Neural Grammatical Error Correction Systems with Unsupervised Pre-training on Synthetic Data},
  author = {Grundkiewicz, Roman and Junczys-Dowmunt, Marcin and Heafield, Kenneth},
  booktitle = {Proceedings of the Fourteenth Workshop on Innovative Use of NLP for Building Educational Applications},
  month = aug,
  year = {2019},
  address = {Florence, Italy},
  publisher = {Association for Computational Linguistics},
  url = {https://aclanthology.org/W19-4427},
  doi = {10.18653/v1/W19-4427},
  pages = {252--263},
  abstract = {Considerable effort has been made to address the data sparsity problem in neural grammatical error correction. In this work, we propose a simple and surprisingly effective unsupervised synthetic error generation method based on confusion sets extracted from a spellchecker to increase the amount of training data. Synthetic data is used to pre-train a Transformer sequence-to-sequence model, which not only improves over a strong baseline trained on authentic error-annotated data, but also enables the development of a practical GEC system in a scenario where little genuine error-annotated data is available. The developed systems placed first in the BEA19 shared task, achieving 69.47 and 64.24 F$_{0.5}$ in the restricted and low-resource tracks respectively, both on the W{\&}I+LOCNESS test set. On the popular CoNLL 2014 test set, we report state-of-the-art results of 64.16 M{\mbox{$^2$}} for the submitted system, and 61.30 M{\mbox{$^2$}} for the constrained system trained on the NUCLE and Lang-8 data.},
  month_numeric = {8}
}
@inproceedings{Oard-evia,
  author = {Douglas Oard and Petra Galuscakova and Kathleen McKeown and Marine Carpuat and Ramy Eskander and Kenneth Heafield and Efsun Kayi and Chris Kedzie and Smaranda Muresan and Suraj Nair and Xing Niu and Dragomir Radev and Anton Ragni and Han-Chin Shing and Yan Virin and Weijia Xu and Rui Zhang and Elena Zotkina and Joseph Barrow and Mark Gales},
  title = {Surprise Languages: Rapid-Response Cross-Language {IR}},
  year = {2019},
  month = jun,
  month_numeric = {6},
  day = {10},
  booktitle = {Proceedings of the Ninth International Workshop on Evaluating Information Access ({EVIA} 2019)},
  address = {Tokyo, Japan},
  url = {https://kheafield.com/papers/edinburgh/evia.pdf}
}
@inproceedings{koehn-etal-2018-findings,
  title = {Findings of the {WMT} 2018 Shared Task on Parallel Corpus Filtering},
  author = {Koehn, Philipp and Khayrallah, Huda and Heafield, Kenneth and Forcada, Mikel L.},
  booktitle = {Proceedings of the Third Conference on Machine Translation: Shared Task Papers},
  month = oct,
  year = {2018},
  address = {Brussels, Belgium},
  publisher = {Association for Computational Linguistics},
  url = {https://aclanthology.org/W18-6453},
  doi = {10.18653/v1/W18-6453},
  pages = {726--739},
  abstract = {We posed the shared task of assigning sentence-level quality scores for a very noisy corpus of sentence pairs crawled from the web, with the goal of sub-selecting 1{\%} and 10{\%} of high-quality data to be used to train machine translation systems. Seventeen participants from companies, national research labs, and universities participated in this task.},
  month_numeric = {10}
}
@inproceedings{haddow-etal-2018-university,
  title = {The University of {E}dinburgh{'}s Submissions to the {WMT}18 News Translation Task},
  author = {Haddow, Barry and Bogoychev, Nikolay and Emelin, Denis and Germann, Ulrich and Grundkiewicz, Roman and Heafield, Kenneth and Miceli Barone, Antonio Valerio and Sennrich, Rico},
  booktitle = {Proceedings of the Third Conference on Machine Translation: Shared Task Papers},
  month = oct,
  year = {2018},
  address = {Brussels, Belgium},
  publisher = {Association for Computational Linguistics},
  url = {https://aclanthology.org/W18-6412},
  doi = {10.18653/v1/W18-6412},
  pages = {399--409},
  abstract = {The University of Edinburgh made submissions to all 14 language pairs in the news translation task, with strong performances in most pairs. We introduce new RNN-variant, mixed RNN/Transformer ensembles, data selection and weighting, and extensions to back-translation.},
  month_numeric = {10}
}
@inproceedings{currey-heafield-2018-multi,
  title = {Multi-Source Syntactic Neural Machine Translation},
  author = {Currey, Anna and Heafield, Kenneth},
  booktitle = {Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing},
  month = oct,
  year = {2018},
  address = {Brussels, Belgium},
  publisher = {Association for Computational Linguistics},
  url = {https://aclanthology.org/D18-1327},
  doi = {10.18653/v1/D18-1327},
  pages = {2961--2966},
  abstract = {We introduce a novel multi-source technique for incorporating source syntax into neural machine translation using linearized parses. This is achieved by employing separate encoders for the sequential and parsed versions of the same source sentence; the resulting representations are then combined using a hierarchical attention mechanism. The proposed model improves over both seq2seq and parsed baselines by over 1 BLEU on the WMT17 English-German task. Further analysis shows that our multi-source syntactic model is able to translate successfully without any parsed input, unlike standard parsed methods. In addition, performance does not deteriorate as much on long sentences as for the baselines.},
  month_numeric = {10}
}
@inproceedings{bogoychev-etal-2018-accelerating,
  title = {Accelerating Asynchronous Stochastic Gradient Descent for Neural Machine Translation},
  author = {Bogoychev, Nikolay and Heafield, Kenneth and Aji, Alham Fikri and Junczys-Dowmunt, Marcin},
  booktitle = {Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing},
  month = oct,
  year = {2018},
  address = {Brussels, Belgium},
  publisher = {Association for Computational Linguistics},
  url = {https://aclanthology.org/D18-1332},
  doi = {10.18653/v1/D18-1332},
  pages = {2991--2996},
  abstract = {In order to extract the best possible performance from asynchronous stochastic gradient descent one must increase the mini-batch size and scale the learning rate accordingly. In order to achieve further speedup we introduce a technique that delays gradient updates effectively increasing the mini-batch size. Unfortunately with the increase of mini-batch size we worsen the stale gradient problem in asynchronous stochastic gradient descent (SGD) which makes the model convergence poor. We introduce local optimizers which mitigate the stale gradient problem and together with fine tuning our momentum we are able to train a shallow machine translation system 27{\%} faster than an optimized baseline with negligible penalty in BLEU.},
  month_numeric = {10}
}
@inproceedings{junczys-dowmunt-etal-2018-marian-cost,
  title = {{M}arian: Cost-effective High-Quality Neural Machine Translation in {C}++},
  author = {Junczys-Dowmunt, Marcin and Heafield, Kenneth and Hoang, Hieu and Grundkiewicz, Roman and Aue, Anthony},
  booktitle = {Proceedings of the 2nd Workshop on Neural Machine Translation and Generation},
  month = jul,
  year = {2018},
  address = {Melbourne, Australia},
  publisher = {Association for Computational Linguistics},
  url = {https://aclanthology.org/W18-2716},
  doi = {10.18653/v1/W18-2716},
  pages = {129--135},
  abstract = {This paper describes the submissions of the {``}Marian{''} team to the WNMT 2018 shared task. We investigate combinations of teacher-student training, low-precision matrix products, auto-tuning and other methods to optimize the Transformer model on GPU and CPU. By further integrating these methods with the new averaging attention networks, a recently introduced faster Transformer variant, we create a number of high-quality, high-performance models on the GPU and CPU, dominating the Pareto frontier for this shared task.},
  month_numeric = {7}
}
@inproceedings{grundkiewicz-heafield-2018-neural,
  title = {Neural Machine Translation Techniques for Named Entity Transliteration},
  author = {Grundkiewicz, Roman and Heafield, Kenneth},
  booktitle = {Proceedings of the Seventh Named Entities Workshop},
  month = jul,
  year = {2018},
  address = {Melbourne, Australia},
  publisher = {Association for Computational Linguistics},
  url = {https://aclanthology.org/W18-2413},
  doi = {10.18653/v1/W18-2413},
  pages = {89--94},
  abstract = {Transliterating named entities from one language into another can be approached as neural machine translation (NMT) problem, for which we use deep attentional RNN encoder-decoder models. To build a strong transliteration system, we apply well-established techniques from NMT, such as dropout regularization, model ensembling, rescoring with right-to-left models, and back-translation. Our submission to the NEWS 2018 Shared Task on Named Entity Transliteration ranked first in several tracks.},
  month_numeric = {7}
}
@inproceedings{hoang-etal-2018-fast,
  title = {Fast Neural Machine Translation Implementation},
  author = {Hoang, Hieu and Dwojak, Tomasz and Krislauks, Rihards and Torregrosa, Daniel and Heafield, Kenneth},
  booktitle = {Proceedings of the 2nd Workshop on Neural Machine Translation and Generation},
  month = jul,
  year = {2018},
  address = {Melbourne, Australia},
  publisher = {Association for Computational Linguistics},
  url = {https://aclanthology.org/W18-2714},
  doi = {10.18653/v1/W18-2714},
  pages = {116--121},
  abstract = {This paper describes the submissions to the efficiency track for GPUs at the Workshop for Neural Machine Translation and Generation by members of the University of Edinburgh, Adam Mickiewicz University, Tilde and University of Alicante. We focus on efficient implementation of the recurrent deep-learning model as implemented in Amun, the fast inference engine for neural machine translation. We improve the performance with an efficient mini-batching algorithm, and by fusing the softmax operation with the k-best extraction algorithm. Submissions using Amun were first, second and third fastest in the GPU efficiency track.},
  month_numeric = {7}
}
@inproceedings{currey-heafield-2018-unsupervised,
  title = {Unsupervised Source Hierarchies for Low-Resource Neural Machine Translation},
  author = {Currey, Anna and Heafield, Kenneth},
  booktitle = {Proceedings of the Workshop on the Relevance of Linguistic Structure in Neural Architectures for {NLP}},
  month = jul,
  year = {2018},
  address = {Melbourne, Australia},
  publisher = {Association for Computational Linguistics},
  url = {https://aclanthology.org/W18-2902},
  doi = {10.18653/v1/W18-2902},
  pages = {6--12},
  abstract = {Incorporating source syntactic information into neural machine translation (NMT) has recently proven successful (Eriguchi et al., 2016; Luong et al., 2016). However, this is generally done using an outside parser to syntactically annotate the training data, making this technique difficult to use for languages or domains for which a reliable parser is not available. In this paper, we introduce an unsupervised tree-to-sequence (tree2seq) model for neural machine translation; this model is able to induce an unsupervised hierarchical structure on the source sentence based on the downstream task of neural machine translation. We adapt the Gumbel tree-LSTM of Choi et al. (2018) to NMT in order to create the encoder. We evaluate our model against sequential and supervised parsing baselines on three low- and medium-resource language pairs. For low-resource cases, the unsupervised tree2seq encoder significantly outperforms the baselines; no improvements are seen for medium-resource translation.},
  month_numeric = {7}
}
@inproceedings{junczys-dowmunt-etal-2018-marian,
  author        = {Junczys-Dowmunt, Marcin and Grundkiewicz, Roman and Dwojak, Tomasz and Hoang, Hieu and Heafield, Kenneth and Neckermann, Tom and Seide, Frank and Germann, Ulrich and Aji, Alham Fikri and Bogoychev, Nikolay and Martins, Andr{\'e} F. T. and Birch, Alexandra},
  title         = {{M}arian: Fast Neural Machine Translation in {C}++},
  booktitle     = {Proceedings of {ACL} 2018, System Demonstrations},
  pages         = {116--121},
  year          = {2018},
  month         = jul,
  month_numeric = {7},
  address       = {Melbourne, Australia},
  publisher     = {Association for Computational Linguistics},
  doi           = {10.18653/v1/P18-4020},
  url           = {https://www.aclweb.org/anthology/P18-4020},
  abstract      = {We present Marian, an efficient and self-contained Neural Machine Translation framework with an integrated automatic differentiation engine based on dynamic computation graphs. Marian is written entirely in C++. We describe the design of the encoder-decoder framework and demonstrate that a research-friendly toolkit can achieve high training and translation speed.}
}
@inproceedings{junczys-dowmunt-etal-2018-approaching,
  author        = {Junczys-Dowmunt, Marcin and Grundkiewicz, Roman and Guha, Shubha and Heafield, Kenneth},
  title         = {Approaching Neural Grammatical Error Correction as a Low-Resource Machine Translation Task},
  booktitle     = {Proceedings of the 2018 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long Papers)},
  pages         = {595--606},
  year          = {2018},
  month         = jun,
  month_numeric = {6},
  address       = {New Orleans, Louisiana},
  publisher     = {Association for Computational Linguistics},
  doi           = {10.18653/v1/N18-1055},
  url           = {https://www.aclweb.org/anthology/N18-1055},
  abstract      = {Previously, neural methods in grammatical error correction (GEC) did not reach state-of-the-art results compared to phrase-based statistical machine translation (SMT) baselines. We demonstrate parallels between neural GEC and low-resource neural MT and successfully adapt several methods from low-resource MT to neural GEC. We further establish guidelines for trustable results in neural GEC and propose a set of model-independent methods for neural GEC that can be easily applied in most GEC settings. Proposed methods include adding source-side noise, domain-adaptation techniques, a GEC-specific training-objective, transfer learning with monolingual data, and ensembling of independently trained GEC models and language models. The combined effects of these methods result in better than state-of-the-art neural GEC models that outperform previously best neural GEC systems by more than 10{\%} M{\mbox{$^2$}} on the CoNLL-2014 benchmark and 5.9{\%} on the JFLEG test set. Non-neural state-of-the-art systems are outperformed by more than 2{\%} on the CoNLL-2014 benchmark and by 4{\%} on JFLEG.}
}
@inproceedings{aji-heafield-2017-sparse,
  author        = {Aji, Alham Fikri and Heafield, Kenneth},
  title         = {Sparse Communication for Distributed Gradient Descent},
  booktitle     = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
  pages         = {440--445},
  year          = {2017},
  month         = sep,
  month_numeric = {9},
  address       = {Copenhagen, Denmark},
  publisher     = {Association for Computational Linguistics},
  doi           = {10.18653/v1/D17-1045},
  url           = {https://www.aclweb.org/anthology/D17-1045},
  abstract      = {We make distributed stochastic gradient descent faster by exchanging sparse updates instead of dense updates. Gradient updates are positively skewed as most updates are near zero, so we map the 99{\%} smallest updates (by absolute value) to zero then exchange sparse matrices. This method can be combined with quantization to further improve the compression. We explore different configurations and apply them to neural machine translation and MNIST image classification tasks. Most configurations work on MNIST, whereas different configurations reduce convergence rate on the more complex translation task. Our experiments show that we can achieve up to 49{\%} speed up on MNIST and 22{\%} on NMT without damaging the final accuracy or BLEU.}
}
@inproceedings{currey-etal-2017-copied,
  author        = {Currey, Anna and Miceli Barone, Antonio Valerio and Heafield, Kenneth},
  title         = {Copied Monolingual Data Improves Low-Resource Neural Machine Translation},
  booktitle     = {Proceedings of the Second Conference on Machine Translation},
  pages         = {148--156},
  year          = {2017},
  month         = sep,
  month_numeric = {9},
  address       = {Copenhagen, Denmark},
  publisher     = {Association for Computational Linguistics},
  doi           = {10.18653/v1/W17-4715},
  url           = {https://www.aclweb.org/anthology/W17-4715}
}
@inproceedings{sennrich-etal-2017-university,
  author        = {Sennrich, Rico and Birch, Alexandra and Currey, Anna and Germann, Ulrich and Haddow, Barry and Heafield, Kenneth and Miceli Barone, Antonio Valerio and Williams, Philip},
  title         = {The University of {E}dinburgh{'}s Neural {MT} Systems for {WMT}17},
  booktitle     = {Proceedings of the Second Conference on Machine Translation},
  pages         = {389--399},
  year          = {2017},
  month         = sep,
  month_numeric = {9},
  address       = {Copenhagen, Denmark},
  publisher     = {Association for Computational Linguistics},
  doi           = {10.18653/v1/W17-4739},
  url           = {https://www.aclweb.org/anthology/W17-4739}
}
@inproceedings{heafield-etal-2016-normalized,
  author        = {Heafield, Kenneth and Geigle, Chase and Massung, Sean and Schwartz, Lane},
  title         = {Normalized Log-Linear Interpolation of Backoff Language Models is Efficient},
  booktitle     = {Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  pages         = {876--886},
  year          = {2016},
  month         = aug,
  month_numeric = {8},
  address       = {Berlin, Germany},
  publisher     = {Association for Computational Linguistics},
  doi           = {10.18653/v1/P16-1083},
  url           = {https://www.aclweb.org/anthology/P16-1083}
}
@inproceedings{heafield-etal-2015-language,
  author        = {Heafield, Kenneth and Kshirsagar, Rohan and Barona, Santiago},
  title         = {Language Identification and Modeling in Specialized Hardware},
  booktitle     = {Proceedings of the 53rd Annual Meeting of the Association for Computational Linguistics and the 7th International Joint Conference on Natural Language Processing (Volume 2: Short Papers)},
  pages         = {384--389},
  year          = {2015},
  month         = jul,
  month_numeric = {7},
  address       = {Beijing, China},
  publisher     = {Association for Computational Linguistics},
  doi           = {10.3115/v1/P15-2063},
  url           = {https://www.aclweb.org/anthology/P15-2063}
}
@inproceedings{durrani-etal-2014-edinburghs,
  author        = {Durrani, Nadir and Haddow, Barry and Koehn, Philipp and Heafield, Kenneth},
  title         = {{E}dinburgh{'}s Phrase-based Machine Translation Systems for {WMT}-14},
  booktitle     = {Proceedings of the Ninth Workshop on Statistical Machine Translation},
  pages         = {97--104},
  year          = {2014},
  month         = jun,
  month_numeric = {6},
  address       = {Baltimore, Maryland, USA},
  publisher     = {Association for Computational Linguistics},
  doi           = {10.3115/v1/W14-3309},
  url           = {https://www.aclweb.org/anthology/W14-3309}
}
@inproceedings{neidert-etal-2014-stanford,
  author        = {Neidert, Julia and Schuster, Sebastian and Green, Spence and Heafield, Kenneth and Manning, Christopher},
  title         = {{S}tanford University{'}s Submissions to the {WMT} 2014 Translation Task},
  booktitle     = {Proceedings of the Ninth Workshop on Statistical Machine Translation},
  pages         = {150--156},
  year          = {2014},
  month         = jun,
  month_numeric = {6},
  address       = {Baltimore, Maryland, USA},
  publisher     = {Association for Computational Linguistics},
  doi           = {10.3115/v1/W14-3316},
  url           = {https://www.aclweb.org/anthology/W14-3316}
}
@inproceedings{heafield-etal-2014-faster,
  author        = {Heafield, Kenneth and Kayser, Michael and Manning, Christopher D.},
  title         = {Faster Phrase-Based Decoding by Refining Feature State},
  booktitle     = {Proceedings of the 52nd Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)},
  pages         = {130--135},
  year          = {2014},
  month         = jun,
  month_numeric = {6},
  address       = {Baltimore, Maryland},
  publisher     = {Association for Computational Linguistics},
  doi           = {10.3115/v1/P14-2022},
  url           = {https://www.aclweb.org/anthology/P14-2022}
}
@inproceedings{buck-etal-2014-n,
  title = {N-gram Counts and Language Models from the {Common} {Crawl}},
  author = {Buck, Christian and Heafield, Kenneth and van Ooyen, Bas},
  booktitle = {Proceedings of the Ninth International Conference on Language Resources and Evaluation ({LREC}'14)},
  month = may,
  year = {2014},
  address = {Reykjavik, Iceland},
  publisher = {European Language Resources Association (ELRA)},
  url = {http://www.lrec-conf.org/proceedings/lrec2014/pdf/1097_Paper.pdf},
  pages = {3579--3584},
  abstract = {We contribute 5-gram counts and language models trained on the Common Crawl corpus, a collection over 9 billion web pages. This release improves upon the Google n-gram counts in two key ways: the inclusion of low-count entries and deduplication to reduce boilerplate. By preserving singletons, we were able to use Kneser-Ney smoothing to build large language models. This paper describes how the corpus was processed with emphasis on the problems that arise in working with data at this scale. Our unpruned Kneser-Ney English 5-gram language model, built on 975 billion deduplicated tokens, contains over 500 billion unique n-grams. We show gains of 0.5-1.4 BLEU by using large language models to translate into various languages.},
  month_numeric = {5}
}
@phdthesis{Heafield-thesis,
  author        = {Kenneth Heafield},
  title         = {Efficient Language Modeling Algorithms with Applications to Statistical Machine Translation},
  school        = {{Carnegie} {Mellon} University},
  year          = {2013},
  month         = sep,
  month_numeric = {9},
  day           = {20},
  url           = {https://kheafield.com/papers/thesis.pdf}
}
@inproceedings{durrani-etal-2013-edinburghs,
  author        = {Durrani, Nadir and Haddow, Barry and Heafield, Kenneth and Koehn, Philipp},
  title         = {{E}dinburgh{'}s Machine Translation Systems for {E}uropean Language Pairs},
  booktitle     = {Proceedings of the Eighth Workshop on Statistical Machine Translation},
  pages         = {114--121},
  year          = {2013},
  month         = aug,
  month_numeric = {8},
  address       = {Sofia, Bulgaria},
  publisher     = {Association for Computational Linguistics},
  url           = {https://www.aclweb.org/anthology/W13-2212}
}
@inproceedings{heafield-etal-2013-scalable,
  author        = {Heafield, Kenneth and Pouzyrevsky, Ivan and Clark, Jonathan H. and Koehn, Philipp},
  title         = {Scalable Modified {K}neser-{N}ey Language Model Estimation},
  booktitle     = {Proceedings of the 51st Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)},
  pages         = {690--696},
  year          = {2013},
  month         = aug,
  month_numeric = {8},
  address       = {Sofia, Bulgaria},
  publisher     = {Association for Computational Linguistics},
  url           = {https://www.aclweb.org/anthology/P13-2121}
}
@inproceedings{heafield-etal-2013-grouping,
  author        = {Heafield, Kenneth and Koehn, Philipp and Lavie, Alon},
  title         = {Grouping Language Model Boundary Words to Speed K{--}Best Extraction from Hypergraphs},
  booktitle     = {Proceedings of the 2013 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies},
  pages         = {958--968},
  year          = {2013},
  month         = jun,
  month_numeric = {6},
  address       = {Atlanta, Georgia},
  publisher     = {Association for Computational Linguistics},
  url           = {https://www.aclweb.org/anthology/N13-1116}
}
@inproceedings{heafield-etal-2012-language,
  author        = {Heafield, Kenneth and Koehn, Philipp and Lavie, Alon},
  title         = {Language Model Rest Costs and Space-Efficient Storage},
  booktitle     = {Proceedings of the 2012 Joint Conference on Empirical Methods in Natural Language Processing and Computational Natural Language Learning},
  pages         = {1169--1178},
  year          = {2012},
  month         = jul,
  month_numeric = {7},
  address       = {Jeju Island, Korea},
  publisher     = {Association for Computational Linguistics},
  url           = {https://www.aclweb.org/anthology/D12-1107}
}
@Patent{Rama-mining-patent,
  author        = {Girish Maskeri Rama and Kenneth Heafield and Santonu Sarkar},
  title         = {Identification of Topics in Source Code},
  number        = {US 8209665},
  location      = {US},
  yearfiled     = {2009},
  year          = {2012},
  month         = jun,
  month_numeric = {6},
  day           = {26}
}
@inproceedings{Heafield-left,
  author        = {Kenneth Heafield and Hieu Hoang and Philipp Koehn and Tetsuo Kiso and Marcello Federico},
  title         = {Left Language Model State for Syntactic Machine Translation},
  booktitle     = {Proceedings of the International Workshop on Spoken Language Translation},
  pages         = {183--190},
  year          = {2011},
  month         = dec,
  month_numeric = {12},
  address       = {San Francisco, California, USA},
  url           = {https://kheafield.com/papers/edinburgh/left\_paper.pdf}
}
@inproceedings{heafield-2011-kenlm,
  author        = {Heafield, Kenneth},
  title         = {{K}en{LM}: Faster and Smaller Language Model Queries},
  booktitle     = {Proceedings of the Sixth Workshop on Statistical Machine Translation},
  pages         = {187--197},
  year          = {2011},
  month         = jul,
  month_numeric = {7},
  address       = {Edinburgh, Scotland},
  publisher     = {Association for Computational Linguistics},
  url           = {https://www.aclweb.org/anthology/W11-2123}
}
@inproceedings{heafield-lavie-2011-cmu,
  author        = {Heafield, Kenneth and Lavie, Alon},
  title         = {{CMU} System Combination in {WMT} 2011},
  booktitle     = {Proceedings of the Sixth Workshop on Statistical Machine Translation},
  pages         = {145--151},
  year          = {2011},
  month         = jul,
  month_numeric = {7},
  address       = {Edinburgh, Scotland},
  publisher     = {Association for Computational Linguistics},
  url           = {https://www.aclweb.org/anthology/W11-2117}
}
@Patent{Curtis-similar,
  author        = {Taylor Curtis and Kenneth Heafield},
  title         = {Systems and Methods for Identifying Similar Documents},
  number        = {US 7958136},
  location      = {US},
  yearfiled     = {2008},
  year          = {2011},
  month         = jun,
  month_numeric = {6},
  day           = {7}
}
@inproceedings{Heafield-voting,
  author        = {Kenneth Heafield and Alon Lavie},
  title         = {Voting on N-grams for Machine Translation System Combination},
  booktitle     = {Proceedings of the Ninth Conference of the Association for Machine Translation in the Americas},
  year          = {2010},
  month         = nov,
  month_numeric = {11},
  address       = {Denver, Colorado, USA},
  url           = {https://kheafield.com/papers/avenue/amta2010.pdf}
}
@inproceedings{heafield-lavie-2010-cmu,
  author        = {Heafield, Kenneth and Lavie, Alon},
  title         = {{CMU} Multi-Engine Machine Translation for {WMT} 2010},
  booktitle     = {Proceedings of the Joint Fifth Workshop on Statistical Machine Translation and {M}etrics{MATR}},
  pages         = {301--306},
  year          = {2010},
  month         = jul,
  month_numeric = {7},
  address       = {Uppsala, Sweden},
  publisher     = {Association for Computational Linguistics},
  url           = {https://www.aclweb.org/anthology/W10-1744}
}
@article{Heafield-marathon,
  author        = {Kenneth Heafield and Alon Lavie},
  title         = {Combining Machine Translation Output with Open Source: The {Carnegie} {Mellon} Multi-Engine Machine Translation Scheme},
  journal       = {The Prague Bulletin of Mathematical Linguistics},
  volume        = {93},
  pages         = {27--36},
  year          = {2010},
  month         = jan,
  month_numeric = {1},
  url           = {https://kheafield.com/papers/avenue/marathon2010.pdf}
}
@article{Clark-loonybin,
  author = {Jonathan H. Clark and Jonathan Weese and Byung Gyu Ahn and Andreas Zollmann and Qin Gao and Kenneth Heafield and Alon Lavie},
  title = {The Machine Translation Toolpack for {LoonyBin}: Automated Management of Experimental Machine Translation {HyperWorkflows}},
  year = {2010},
  month = jan,
  month_numeric = {1},
  journal = {The Prague Bulletin of Mathematical Linguistics},
  volume = {93},
  pages = {117--126},
  url = {https://kheafield.com/papers/avenue/loonybin.pdf}
}
@inproceedings{Heafield-nist,
  author = {Kenneth Heafield},
  title = {{CMU-StatXfer} Group System Combination},
  year = {2009},
  month = sep,
  month_numeric = {9},
  day = {1},
  booktitle = {Proceedings of the {NIST} Open {MT} Workshop at {MT} Summit {XII}},
  address = {Ottawa, Canada}
}
@inproceedings{heafield-etal-2009-machine,
  author        = {Heafield, Kenneth and Hanneman, Greg and Lavie, Alon},
  title         = {Machine Translation System Combination with Flexible Word Ordering},
  booktitle     = {Proceedings of the Fourth Workshop on Statistical Machine Translation},
  pages         = {56--60},
  year          = {2009},
  month         = mar,
  month_numeric = {3},
  address       = {Athens, Greece},
  publisher     = {Association for Computational Linguistics},
  url           = {https://www.aclweb.org/anthology/W09-0408}
}
@inproceedings{Maskeri-mining,
  author        = {Girish Maskeri and Santonu Sarkar and Kenneth Heafield},
  title         = {Mining Business Topics in Source Code using Latent {Dirichlet} Allocation},
  booktitle     = {Proceedings of the 1st India Software Engineering Conference},
  pages         = {113--120},
  year          = {2008},
  month         = feb,
  month_numeric = {2},
  address       = {Hyderabad, India},
  url           = {https://kheafield.com/papers/infosys/isec031-maskeri.pdf}
}
@inproceedings{Browne-RR,
  author = {Stanley Browne and Jonathan Wheatley and Barry Welsh and Mark Seibert and Kenneth Heafield and R. Michael Rich and the GALEX Science Team},
  title = {{RR} {Lyrae} Stars in the Far Ultraviolet: {GALEX} Observations Compared with Theoretical Predictions},
  year = {2006},
  month = jan,
  month_numeric = {1},
  booktitle = {Proceedings of the American Astronomical Society 207th Meeting},
  address = {Washington, DC, USA}
}
@article{Welsh-GUVV,
  author = {Barry Welsh and Jonathan Wheatley and Kenneth Heafield and Mark Seibert and the GALEX Science Team},
  title = {The {GALEX} Ultraviolet Variability Catalog},
  year = {2005},
  journal = {The Astronomical Journal},
  volume = {130},
  pages = {825--831},
  url = {https://kheafield.com/papers/galex/accepted.pdf}
}
@inproceedings{Welsh-Flaring,
  author        = {Barry Welsh and Jonathan Wheatley and Kenneth Heafield and Mark Seibert and Stanley Browne and the GALEX Science Team},
  title         = {The Flaring {UV} Sky},
  booktitle     = {Proceedings of the American Astronomical Society 205th Meeting},
  year          = {2005},
  month         = jan,
  month_numeric = {1},
  address       = {San Diego, California, USA}
}