Bibliography
50 papers spanning 1948–2024
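
The entries below are standard BibTeX and can be cited by the keys shown (e.g. shannon1948, vaswani2017). A minimal usage sketch, assuming the entries are collected into a single file named references.bib (the file name is an assumption; any name passed to \bibliography works):

% Minimal wrapper document; assumes the entries below are saved as references.bib
\documentclass{article}
\begin{document}
The Transformer \cite{vaswani2017} builds on attention mechanisms
introduced for machine translation \cite{bahdanau2015,luong2015}.
\bibliographystyle{plain}
\bibliography{references}
\end{document}

Run pdflatex, then bibtex, then pdflatex twice more so the citation labels resolve.
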
A Mathematical Theory of Communication
The Bell System Technical Journal, 27(3), 379–423 (1948)
BibTeX
@article{shannon1948,
author = {Shannon, Claude E.},
title = {A Mathematical Theory of Communication},
journal = {The Bell System Technical Journal},
volume = {27},
number = {3},
pages = {379--423},
year = {1948}
}

Interpolated Estimation of Markov Source Parameters from Sparse Data
Proceedings of the Workshop on Pattern Recognition in Practice, 381–397 (1980)
BibTeX
@inproceedings{jelinek1980interpolated,
author = {Jelinek, Frederick and Mercer, Robert L.},
title = {Interpolated Estimation of {Markov} Source Parameters from Sparse Data},
booktitle = {Proceedings of the Workshop on Pattern Recognition in Practice},
pages = {381--397},
year = {1980}
}

Finding Structure in Time
Cognitive Science, 14(2), 179–211 (1990)
BibTeX
@article{elman1990finding,
author = {Elman, Jeffrey L.},
title = {Finding Structure in Time},
journal = {Cognitive Science},
volume = {14},
number = {2},
pages = {179--211},
year = {1990}
}

Long Short-Term Memory
Neural Computation, 9(8), 1735–1780 (1997)
BibTeX
@article{hochreiter1997long,
author = {Hochreiter, Sepp and Schmidhuber, J{\"u}rgen},
title = {Long Short-Term Memory},
journal = {Neural Computation},
volume = {9},
number = {8},
pages = {1735--1780},
year = {1997}
}

An Empirical Study of Smoothing Techniques for Language Modeling
Computer Speech & Language, 13(4), 359–394 (1999)
BibTeX
@article{chen1999empirical,
author = {Chen, Stanley F. and Goodman, Joshua},
title = {An Empirical Study of Smoothing Techniques for Language Modeling},
journal = {Computer Speech \& Language},
volume = {13},
number = {4},
pages = {359--394},
year = {1999}
}

A Neural Probabilistic Language Model
Journal of Machine Learning Research, 3, 1137–1155 (2003)
BibTeX
@article{bengio2003,
author = {Bengio, Yoshua and Ducharme, R{\'e}jean and Vincent, Pascal and Jauvin, Christian},
title = {A Neural Probabilistic Language Model},
journal = {Journal of Machine Learning Research},
volume = {3},
pages = {1137--1155},
year = {2003}
}

Recurrent Neural Network based Language Model
Proceedings of INTERSPEECH, 1045–1048 (2010)
BibTeX
@inproceedings{mikolov2010recurrent,
author = {Mikolov, Tomas and Karafi{\'a}t, Martin and Burget, Luk{\'a}{\v{s}} and {\v{C}}ernock{\'y}, Jan and Khudanpur, Sanjeev},
title = {Recurrent Neural Network based Language Model},
booktitle = {Proceedings of INTERSPEECH},
pages = {1045--1048},
year = {2010}
}

Efficient Estimation of Word Representations in Vector Space
Proceedings of ICLR Workshop (2013)
BibTeX
@inproceedings{mikolov2013efficient,
author = {Mikolov, Tomas and Chen, Kai and Corrado, Greg and Dean, Jeffrey},
title = {Efficient Estimation of Word Representations in Vector Space},
booktitle = {Proceedings of ICLR Workshop},
year = {2013}
}

Distributed Representations of Words and Phrases and Their Compositionality
Advances in Neural Information Processing Systems (NeurIPS), 26, 3111–3119 (2013)
BibTeX
@inproceedings{mikolov2013distributed,
author = {Mikolov, Tomas and Sutskever, Ilya and Chen, Kai and Corrado, Greg S. and Dean, Jeffrey},
title = {Distributed Representations of Words and Phrases and Their Compositionality},
booktitle = {Advances in Neural Information Processing Systems (NeurIPS)},
volume = {26},
pages = {3111--3119},
year = {2013}
}

Learning Phrase Representations using RNN Encoder-Decoder for Statistical Machine Translation
Proceedings of EMNLP, 1724–1734 (2014)
BibTeX
@inproceedings{cho2014learning,
author = {Cho, Kyunghyun and van Merri{\"e}nboer, Bart and Gulcehre, Caglar and Bahdanau, Dzmitry and others},
title = {Learning Phrase Representations using {RNN} Encoder-Decoder for Statistical Machine Translation},
booktitle = {Proceedings of EMNLP},
pages = {1724--1734},
year = {2014}
}

GloVe: Global Vectors for Word Representation
Proceedings of EMNLP, 1532–1543 (2014)
BibTeX
@inproceedings{pennington2014,
author = {Pennington, Jeffrey and Socher, Richard and Manning, Christopher D.},
title = {{GloVe}: Global Vectors for Word Representation},
booktitle = {Proceedings of EMNLP},
pages = {1532--1543},
year = {2014}
}

Sequence to Sequence Learning with Neural Networks
Advances in Neural Information Processing Systems (NeurIPS), 27, 3104–3112 (2014)
BibTeX
@inproceedings{sutskever2014sequence,
author = {Sutskever, Ilya and Vinyals, Oriol and Le, Quoc V.},
title = {Sequence to Sequence Learning with Neural Networks},
booktitle = {Advances in Neural Information Processing Systems (NeurIPS)},
volume = {27},
pages = {3104--3112},
year = {2014}
}

Neural Machine Translation by Jointly Learning to Align and Translate
Proceedings of ICLR (2015)
BibTeX
@inproceedings{bahdanau2015,
author = {Bahdanau, Dzmitry and Cho, Kyunghyun and Bengio, Yoshua},
title = {Neural Machine Translation by Jointly Learning to Align and Translate},
booktitle = {Proceedings of ICLR},
year = {2015}
}

Effective Approaches to Attention-based Neural Machine Translation
Proceedings of EMNLP, 1412–1421 (2015)
BibTeX
@inproceedings{luong2015,
author = {Luong, Minh-Thang and Pham, Hieu and Manning, Christopher D.},
title = {Effective Approaches to Attention-based Neural Machine Translation},
booktitle = {Proceedings of EMNLP},
pages = {1412--1421},
year = {2015}
}

Neural Machine Translation of Rare Words with Subword Units
Proceedings of ACL, 1715–1725 (2016)
BibTeX
@inproceedings{sennrich2016,
author = {Sennrich, Rico and Haddow, Barry and Birch, Alexandra},
title = {Neural Machine Translation of Rare Words with Subword Units},
booktitle = {Proceedings of ACL},
pages = {1715--1725},
year = {2016}
}

Enriching Word Vectors with Subword Information
Transactions of the Association for Computational Linguistics (TACL), 5, 135–146 (2017)
BibTeX
@article{bojanowski2017enriching,
author = {Bojanowski, Piotr and Grave, Edouard and Joulin, Armand and Mikolov, Tomas},
title = {Enriching Word Vectors with Subword Information},
journal = {Transactions of the Association for Computational Linguistics (TACL)},
volume = {5},
pages = {135--146},
year = {2017}
}

Outrageously Large Neural Networks: The Sparsely-Gated Mixture-of-Experts Layer
Proceedings of ICLR (2017)
BibTeX
@inproceedings{shazeer2017outrageously,
author = {Shazeer, Noam and Mirhoseini, Azalia and Maziarz, Krzysztof and Davis, Andy and Le, Quoc and Hinton, Geoffrey and Dean, Jeff},
title = {Outrageously Large Neural Networks: The Sparsely-Gated Mixture-of-Experts Layer},
booktitle = {Proceedings of ICLR},
year = {2017}
}

Attention Is All You Need
Advances in Neural Information Processing Systems (NeurIPS), 30, 5998–6008 (2017)
BibTeX
@inproceedings{vaswani2017,
author = {Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N. and Kaiser, {\L}ukasz and Polosukhin, Illia},
title = {Attention Is All You Need},
booktitle = {Advances in Neural Information Processing Systems (NeurIPS)},
volume = {30},
pages = {5998--6008},
year = {2017}
}

Improving Language Understanding by Generative Pre-Training
OpenAI Technical Report (2018)
BibTeX
@techreport{radford2018,
author = {Radford, Alec and Narasimhan, Karthik and Salimans, Tim and Sutskever, Ilya},
title = {Improving Language Understanding by Generative Pre-Training},
institution = {OpenAI},
type = {Technical Report},
year = {2018}
}

What Does BERT Look At? An Analysis of BERT's Attention
Proceedings of ACL Workshop BlackboxNLP, 276–286 (2019)
BibTeX
@inproceedings{clark2019what,
author = {Clark, Kevin and Khandelwal, Urvashi and Levy, Omer and Manning, Christopher D.},
title = {What Does {BERT} Look At? {An} Analysis of {BERT}'s Attention},
booktitle = {Proceedings of ACL Workshop BlackboxNLP},
pages = {276--286},
year = {2019}
}

BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding
Proceedings of NAACL-HLT, 4171–4186 (2019)
BibTeX
@inproceedings{devlin2019,
author = {Devlin, Jacob and Chang, Ming-Wei and Lee, Kenton and Toutanova, Kristina},
title = {{BERT}: Pre-training of Deep Bidirectional Transformers for Language Understanding},
booktitle = {Proceedings of NAACL-HLT},
pages = {4171--4186},
year = {2019}
}

Language Models are Unsupervised Multitask Learners
OpenAI Technical Report (2019)
BibTeX
@techreport{radford2019,
author = {Radford, Alec and Wu, Jeffrey and Child, Rewon and Luan, David and Amodei, Dario and Sutskever, Ilya},
title = {Language Models are Unsupervised Multitask Learners},
institution = {OpenAI},
type = {Technical Report},
year = {2019}
}

Energy and Policy Considerations for Deep Learning in NLP
Proceedings of ACL, 3645–3650 (2019)
BibTeX
@inproceedings{strubell2019energy,
author = {Strubell, Emma and Ganesh, Ananya and McCallum, Andrew},
title = {Energy and Policy Considerations for Deep Learning in {NLP}},
booktitle = {Proceedings of ACL},
pages = {3645--3650},
year = {2019}
}

Language Models are Few-Shot Learners
Advances in Neural Information Processing Systems (NeurIPS), 33, 1877–1901 (2020)
BibTeX
@inproceedings{brown2020,
author = {Brown, Tom B. and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D. and others},
title = {Language Models are Few-Shot Learners},
booktitle = {Advances in Neural Information Processing Systems (NeurIPS)},
volume = {33},
pages = {1877--1901},
year = {2020}
}

The Curious Case of Neural Text Degeneration
Proceedings of ICLR (2020)
BibTeX
@inproceedings{holtzman2020curious,
author = {Holtzman, Ari and Buys, Jan and Du, Li and Forbes, Maxwell and Choi, Yejin},
title = {The Curious Case of Neural Text Degeneration},
booktitle = {Proceedings of ICLR},
year = {2020}
}

Scaling Laws for Neural Language Models
arXiv preprint arXiv:2001.08361 (2020)
BibTeX
@article{kaplan2020,
author = {Kaplan, Jared and McCandlish, Sam and Henighan, Tom and Brown, Tom B. and Chess, Benjamin and others},
title = {Scaling Laws for Neural Language Models},
journal = {arXiv preprint arXiv:2001.08361},
year = {2020}
}

Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks
Advances in Neural Information Processing Systems (NeurIPS), 33, 9459–9474 (2020)
BibTeX
@inproceedings{lewis2020retrieval,
author = {Lewis, Patrick and Perez, Ethan and Piktus, Aleksandra and Petroni, Fabio and others},
title = {Retrieval-Augmented Generation for Knowledge-Intensive {NLP} Tasks},
booktitle = {Advances in Neural Information Processing Systems (NeurIPS)},
volume = {33},
pages = {9459--9474},
year = {2020}
}

Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer
Journal of Machine Learning Research (JMLR), 21(140), 1–67 (2020)
BibTeX
@article{raffel2020exploring,
author = {Raffel, Colin and Shazeer, Noam and Roberts, Adam and Lee, Katherine and others},
title = {Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer},
journal = {Journal of Machine Learning Research (JMLR)},
volume = {21},
number = {140},
pages = {1--67},
year = {2020}
}

A General Language Assistant as a Laboratory for Alignment
arXiv preprint arXiv:2112.00861 (2021)
BibTeX
@article{askell2021general,
author = {Askell, Amanda and Bai, Yuntao and Chen, Anna and Drain, Dawn and others},
title = {A General Language Assistant as a Laboratory for Alignment},
journal = {arXiv preprint arXiv:2112.00861},
year = {2021}
}

Extracting Training Data from Large Language Models
Proceedings of the 30th USENIX Security Symposium, 2633–2650 (2021)
BibTeX
@inproceedings{carlini2021extracting,
author = {Carlini, Nicholas and Tramer, Florian and Wallace, Eric and Jagielski, Matthew and others},
title = {Extracting Training Data from Large Language Models},
booktitle = {Proceedings of the 30th USENIX Security Symposium},
pages = {2633--2650},
year = {2021}
}

Carbon Emissions and Large Neural Network Training
arXiv preprint arXiv:2104.10350 (2021)
BibTeX
@article{patterson2021carbon,
author = {Patterson, David and Gonzalez, Joseph and Le, Quoc and Liang, Chen and others},
title = {Carbon Emissions and Large Neural Network Training},
journal = {arXiv preprint arXiv:2104.10350},
year = {2021}
}

Learning Transferable Visual Models From Natural Language Supervision
Proceedings of ICML, 8748–8763 (2021)
BibTeX
@inproceedings{radford2021learning,
author = {Radford, Alec and Kim, Jong Wook and Hallacy, Chris and Ramesh, Aditya and others},
title = {Learning Transferable Visual Models From Natural Language Supervision},
booktitle = {Proceedings of ICML},
pages = {8748--8763},
year = {2021}
}

RoFormer: Enhanced Transformer with Rotary Position Embedding
arXiv preprint arXiv:2104.09864 (2021)
BibTeX
@article{su2021roformer,
author = {Su, Jianlin and Lu, Yu and Pan, Shengfeng and Murtadha, Ahmed and Wen, Bo and Liu, Yunfeng},
title = {{RoFormer}: Enhanced Transformer with Rotary Position Embedding},
journal = {arXiv preprint arXiv:2104.09864},
year = {2021}
}

Constitutional AI: Harmlessness from AI Feedback
arXiv preprint arXiv:2212.08073 (2022)
BibTeX
@article{bai2022constitutional,
author = {Bai, Yuntao and Kadavath, Saurav and Kundu, Sandipan and Askell, Amanda and others},
title = {Constitutional {AI}: Harmlessness from {AI} Feedback},
journal = {arXiv preprint arXiv:2212.08073},
year = {2022}
}

Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity
Journal of Machine Learning Research (JMLR), 23(120), 1–39 (2022)
BibTeX
@article{fedus2022switch,
author = {Fedus, William and Zoph, Barret and Shazeer, Noam},
title = {Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity},
journal = {Journal of Machine Learning Research (JMLR)},
volume = {23},
number = {120},
pages = {1--39},
year = {2022}
}

Training Compute-Optimal Large Language Models
Advances in Neural Information Processing Systems (NeurIPS), 35, 30016–30030 (2022)
BibTeX
@inproceedings{hoffmann2022,
author = {Hoffmann, Jordan and Borgeaud, Sebastian and Mensch, Arthur and Buchatskaya, Elena and others},
title = {Training Compute-Optimal Large Language Models},
booktitle = {Advances in Neural Information Processing Systems (NeurIPS)},
volume = {35},
pages = {30016--30030},
year = {2022}
}

LoRA: Low-Rank Adaptation of Large Language Models
Proceedings of ICLR (2022)
BibTeX
@inproceedings{hu2021,
author = {Hu, Edward J. and Shen, Yelong and Wallis, Phillip and Allen-Zhu, Zeyuan and Li, Yuanzhi and Wang, Shean and Wang, Lu and Chen, Weizhu},
title = {{LoRA}: Low-Rank Adaptation of Large Language Models},
booktitle = {Proceedings of ICLR},
year = {2022}
}

Training Language Models to Follow Instructions with Human Feedback
Advances in Neural Information Processing Systems (NeurIPS), 35, 27730–27744 (2022)
BibTeX
@inproceedings{ouyang2022,
author = {Ouyang, Long and Wu, Jeffrey and Jiang, Xu and Almeida, Diogo and others},
title = {Training Language Models to Follow Instructions with Human Feedback},
booktitle = {Advances in Neural Information Processing Systems (NeurIPS)},
volume = {35},
pages = {27730--27744},
year = {2022}
}

Red Teaming Language Models with Language Models
Proceedings of EMNLP, 3419–3448 (2022)
BibTeX
@inproceedings{perez2022red,
author = {Perez, Ethan and Ringer, Sam and others},
title = {Red Teaming Language Models with Language Models},
booktitle = {Proceedings of EMNLP},
pages = {3419--3448},
year = {2022}
}

Train Short, Test Long: Attention with Linear Biases Enables Input Length Extrapolation
Proceedings of ICLR (2022)
BibTeX
@inproceedings{press2022train,
author = {Press, Ofir and Smith, Noah A. and Lewis, Mike},
title = {Train Short, Test Long: Attention with Linear Biases Enables Input Length Extrapolation},
booktitle = {Proceedings of ICLR},
year = {2022}
}

Chain-of-Thought Prompting Elicits Reasoning in Large Language Models
Advances in Neural Information Processing Systems (NeurIPS), 35, 24824–24837 (2022)
BibTeX
@inproceedings{wei2022,
author = {Wei, Jason and Wang, Xuezhi and Schuurmans, Dale and Bosma, Maarten and others},
title = {Chain-of-Thought Prompting Elicits Reasoning in Large Language Models},
booktitle = {Advances in Neural Information Processing Systems (NeurIPS)},
volume = {35},
pages = {24824--24837},
year = {2022}
}

Emergent Abilities of Large Language Models
Transactions on Machine Learning Research (TMLR) (2022)
BibTeX
@article{wei2022emergent,
author = {Wei, Jason and Tay, Yi and Bommasani, Rishi and Raffel, Colin and others},
title = {Emergent Abilities of Large Language Models},
journal = {Transactions on Machine Learning Research (TMLR)},
year = {2022}
}

What Learning Algorithm Is In-Context Learning? Investigations with Linear Models
Proceedings of ICLR (2023)
BibTeX
@inproceedings{akyurek2023what,
author = {Aky{\"u}rek, Ekin and Schuurmans, Dale and Andreas, Jacob and Ma, Tengyu and Zhou, Denny},
title = {What Learning Algorithm Is In-Context Learning? {Investigations} with Linear Models},
booktitle = {Proceedings of ICLR},
year = {2023}
}

Visual Instruction Tuning
Advances in Neural Information Processing Systems (NeurIPS), 36 (2023)
BibTeX
@inproceedings{liu2023visual,
author = {Liu, Haotian and Li, Chunyuan and Wu, Qingyang and Lee, Yong Jae},
title = {Visual Instruction Tuning},
booktitle = {Advances in Neural Information Processing Systems (NeurIPS)},
volume = {36},
year = {2023}
}

Direct Preference Optimization: Your Language Model Is Secretly a Reward Model
Advances in Neural Information Processing Systems (NeurIPS), 36 (2023)
BibTeX
@inproceedings{rafailov2023,
author = {Rafailov, Rafael and Sharma, Archit and Mitchell, Eric and Ermon, Stefano and Manning, Christopher D. and Finn, Chelsea},
title = {Direct Preference Optimization: Your Language Model Is Secretly a Reward Model},
booktitle = {Advances in Neural Information Processing Systems (NeurIPS)},
volume = {36},
year = {2023}
}

Are Emergent Abilities of Large Language Models a Mirage?
Advances in Neural Information Processing Systems (NeurIPS), 36 (2023)
BibTeX
@inproceedings{schaeffer2023emergent,
author = {Schaeffer, Rylan and Miranda, Brando and Koyejo, Sanmi},
title = {Are Emergent Abilities of Large Language Models a Mirage?},
booktitle = {Advances in Neural Information Processing Systems (NeurIPS)},
volume = {36},
year = {2023}
}

LLaMA: Open and Efficient Foundation Language Models
arXiv preprint arXiv:2302.13971 (2023)
BibTeX
@article{touvron2023,
author = {Touvron, Hugo and Lavril, Thibaut and Izacard, Gautier and Martinet, Xavier and others},
title = {{LLaMA}: Open and Efficient Foundation Language Models},
journal = {arXiv preprint arXiv:2302.13971},
year = {2023}
}

Self-Consistency Improves Chain of Thought Reasoning in Language Models
Proceedings of ICLR (2023)
BibTeX
@inproceedings{wang2023selfconsistency,
author = {Wang, Xuezhi and Wei, Jason and Schuurmans, Dale and Le, Quoc and others},
title = {Self-Consistency Improves Chain of Thought Reasoning in Language Models},
booktitle = {Proceedings of ICLR},
year = {2023}
}

ReAct: Synergizing Reasoning and Acting in Language Models
Proceedings of ICLR (2023)
BibTeX
@inproceedings{yao2023react,
author = {Yao, Shunyu and Zhao, Jeffrey and Yu, Dian and Du, Nan and others},
title = {{ReAct}: Synergizing Reasoning and Acting in Language Models},
booktitle = {Proceedings of ICLR},
year = {2023}
}

Mixtral of Experts
arXiv preprint arXiv:2401.04088 (2024)
BibTeX
@article{jiang2024mixtral,
author = {Jiang, Albert Q. and Sablayrolles, Alexandre and Roux, Antoine and Mensch, Arthur and others},
title = {Mixtral of Experts},
journal = {arXiv preprint arXiv:2401.04088},
year = {2024}
}

[Figure: Publication timeline. Dot size reflects the number of papers per year.]