Bibliography
50 papers spanning 1948–2024
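
The entries below are standard BibTeX and can be cited by the keys shown (e.g. shannon1948, vaswani2017). A minimal usage sketch, assuming the entries are collected into a single file named references.bib (the file name is an assumption; any name passed to \bibliography works):

% Minimal wrapper document; assumes the entries below are saved as references.bib
\documentclass{article}
\begin{document}
The Transformer \cite{vaswani2017} builds on attention mechanisms
introduced for machine translation \cite{bahdanau2015,luong2015}.
\bibliographystyle{plain}
\bibliography{references}
\end{document}

Run pdflatex, then bibtex, then pdflatex twice more so the citation labels resolve.
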
A Mathematical Theory of Communication
The Bell System Technical Journal, 27(3), 379–423 (1948)
BibTeX
@article{shannon1948,
author = {Shannon, Claude E.},
title = {A Mathematical Theory of Communication},
journal = {The Bell System Technical Journal},
volume = {27},
number = {3},
pages = {379--423},
year = {1948}
}

Interpolated Estimation of Markov Source Parameters from Sparse Data
Proceedings of the Workshop on Pattern Recognition in Practice, 381–397 (1980)
BibTeX
@inproceedings{jelinek1980interpolated,
author = {Jelinek, Frederick and Mercer, Robert L.},
title = {Interpolated Estimation of {Markov} Source Parameters from Sparse Data},
booktitle = {Proceedings of the Workshop on Pattern Recognition in Practice},
pages = {381--397},
year = {1980}
}

Finding Structure in Time
Cognitive Science, 14(2), 179–211 (1990)
BibTeX
@article{elman1990finding,
author = {Elman, Jeffrey L.},
title = {Finding Structure in Time},
journal = {Cognitive Science},
volume = {14},
number = {2},
pages = {179--211},
year = {1990}
}

Long Short-Term Memory
Neural Computation, 9(8), 1735–1780 (1997)
BibTeX
@article{hochreiter1997long,
author = {Hochreiter, Sepp and Schmidhuber, J{\"u}rgen},
title = {Long Short-Term Memory},
journal = {Neural Computation},
volume = {9},
number = {8},
pages = {1735--1780},
year = {1997}
}

An Empirical Study of Smoothing Techniques for Language Modeling
Computer Speech & Language, 13(4), 359–394 (1999)
BibTeX
@article{chen1999empirical,
author = {Chen, Stanley F. and Goodman, Joshua},
title = {An Empirical Study of Smoothing Techniques for Language Modeling},
journal = {Computer Speech \& Language},
volume = {13},
number = {4},
pages = {359--394},
year = {1999}
}

A Neural Probabilistic Language Model
Journal of Machine Learning Research, 3, 1137–1155 (2003)
BibTeX
@article{bengio2003,
author = {Bengio, Yoshua and Ducharme, R{\'e}jean and Vincent, Pascal and Jauvin, Christian},
title = {A Neural Probabilistic Language Model},
journal = {Journal of Machine Learning Research},
volume = {3},
pages = {1137--1155},
year = {2003}
}

Recurrent Neural Network based Language Model
Proceedings of INTERSPEECH, 1045–1048 (2010)
BibTeX
@inproceedings{mikolov2010recurrent,
author = {Mikolov, Tomas and Karafi{\'a}t, Martin and Burget, Luk{\'a}{\v{s}} and {\v{C}}ernock{\'y}, Jan and Khudanpur, Sanjeev},
title = {Recurrent Neural Network based Language Model},
booktitle = {Proceedings of INTERSPEECH},
pages = {1045--1048},
year = {2010}
}

Efficient Estimation of Word Representations in Vector Space
Proceedings of ICLR Workshop (2013)
BibTeX
@inproceedings{mikolov2013efficient,
author = {Mikolov, Tomas and Chen, Kai and Corrado, Greg and Dean, Jeffrey},
title = {Efficient Estimation of Word Representations in Vector Space},
booktitle = {Proceedings of ICLR Workshop},
year = {2013}
}

Distributed Representations of Words and Phrases and Their Compositionality
Advances in Neural Information Processing Systems (NeurIPS), 26, 3111–3119 (2013)
BibTeX
@inproceedings{mikolov2013distributed,
author = {Mikolov, Tomas and Sutskever, Ilya and Chen, Kai and Corrado, Greg S. and Dean, Jeffrey},
title = {Distributed Representations of Words and Phrases and Their Compositionality},
booktitle = {Advances in Neural Information Processing Systems (NeurIPS)},
volume = {26},
pages = {3111--3119},
year = {2013}
}

Learning Phrase Representations using RNN Encoder-Decoder for Statistical Machine Translation
Proceedings of EMNLP, 1724–1734 (2014)
BibTeX
@inproceedings{cho2014learning,
author = {Cho, Kyunghyun and van Merri{\"e}nboer, Bart and Gulcehre, Caglar and Bahdanau, Dzmitry and others},
title = {Learning Phrase Representations using {RNN} Encoder-Decoder for Statistical Machine Translation},
booktitle = {Proceedings of EMNLP},
pages = {1724--1734},
year = {2014}
}

GloVe: Global Vectors for Word Representation
Proceedings of EMNLP, 1532–1543 (2014)
BibTeX
@inproceedings{pennington2014,
author = {Pennington, Jeffrey and Socher, Richard and Manning, Christopher D.},
title = {{GloVe}: Global Vectors for Word Representation},
booktitle = {Proceedings of EMNLP},
pages = {1532--1543},
year = {2014}
}

Sequence to Sequence Learning with Neural Networks
Advances in Neural Information Processing Systems (NeurIPS), 27, 3104–3112 (2014)
BibTeX
@inproceedings{sutskever2014sequence,
author = {Sutskever, Ilya and Vinyals, Oriol and Le, Quoc V.},
title = {Sequence to Sequence Learning with Neural Networks},
booktitle = {Advances in Neural Information Processing Systems (NeurIPS)},
volume = {27},
pages = {3104--3112},
year = {2014}
}

Neural Machine Translation by Jointly Learning to Align and Translate
Proceedings of ICLR (2015)
BibTeX
@inproceedings{bahdanau2015,
author = {Bahdanau, Dzmitry and Cho, Kyunghyun and Bengio, Yoshua},
title = {Neural Machine Translation by Jointly Learning to Align and Translate},
booktitle = {Proceedings of ICLR},
year = {2015}
}

Effective Approaches to Attention-based Neural Machine Translation
Proceedings of EMNLP, 1412–1421 (2015)
BibTeX
@inproceedings{luong2015,
author = {Luong, Minh-Thang and Pham, Hieu and Manning, Christopher D.},
title = {Effective Approaches to Attention-based Neural Machine Translation},
booktitle = {Proceedings of EMNLP},
pages = {1412--1421},
year = {2015}
}

Neural Machine Translation of Rare Words with Subword Units
Proceedings of ACL, 1715–1725 (2016)
BibTeX
@inproceedings{sennrich2016,
author = {Sennrich, Rico and Haddow, Barry and Birch, Alexandra},
title = {Neural Machine Translation of Rare Words with Subword Units},
booktitle = {Proceedings of ACL},
pages = {1715--1725},
year = {2016}
}

Enriching Word Vectors with Subword Information
Transactions of the Association for Computational Linguistics (TACL), 5, 135–146 (2017)
BibTeX
@article{bojanowski2017enriching,
author = {Bojanowski, Piotr and Grave, Edouard and Joulin, Armand and Mikolov, Tomas},
title = {Enriching Word Vectors with Subword Information},
journal = {Transactions of the Association for Computational Linguistics (TACL)},
volume = {5},
pages = {135--146},
year = {2017}
}

Outrageously Large Neural Networks: The Sparsely-Gated Mixture-of-Experts Layer
Proceedings of ICLR (2017)
BibTeX
@inproceedings{shazeer2017outrageously,
author = {Shazeer, Noam and Mirhoseini, Azalia and Maziarz, Krzysztof and Davis, Andy and Le, Quoc and Hinton, Geoffrey and Dean, Jeff},
title = {Outrageously Large Neural Networks: The Sparsely-Gated Mixture-of-Experts Layer},
booktitle = {Proceedings of ICLR},
year = {2017}
}

Attention Is All You Need
Advances in Neural Information Processing Systems (NeurIPS), 30, 5998–6008 (2017)
BibTeX
@inproceedings{vaswani2017,
author = {Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N. and Kaiser, {\L}ukasz and Polosukhin, Illia},
title = {Attention Is All You Need},
booktitle = {Advances in Neural Information Processing Systems (NeurIPS)},
volume = {30},
pages = {5998--6008},
year = {2017}
}

Improving Language Understanding by Generative Pre-Training
OpenAI Technical Report (2018)
BibTeX
@techreport{radford2018,
author = {Radford, Alec and Narasimhan, Karthik and Salimans, Tim and Sutskever, Ilya},
title = {Improving Language Understanding by Generative Pre-Training},
institution = {OpenAI},
type = {Technical Report},
year = {2018}
}

What Does BERT Look At? An Analysis of BERT's Attention
Proceedings of ACL Workshop BlackboxNLP, 276–286 (2019)
BibTeX
@inproceedings{clark2019what,
author = {Clark, Kevin and Khandelwal, Urvashi and Levy, Omer and Manning, Christopher D.},
title = {What Does {BERT} Look At? {An} Analysis of {BERT}'s Attention},
booktitle = {Proceedings of ACL Workshop BlackboxNLP},
pages = {276--286},
year = {2019}
}

BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding
Proceedings of NAACL-HLT, 4171–4186 (2019)
BibTeX
@inproceedings{devlin2019,
author = {Devlin, Jacob and Chang, Ming-Wei and Lee, Kenton and Toutanova, Kristina},
title = {{BERT}: Pre-training of Deep Bidirectional Transformers for Language Understanding},
booktitle = {Proceedings of NAACL-HLT},
pages = {4171--4186},
year = {2019}
}

Language Models are Unsupervised Multitask Learners
OpenAI Technical Report (2019)
BibTeX
@techreport{radford2019,
author = {Radford, Alec and Wu, Jeffrey and Child, Rewon and Luan, David and Amodei, Dario and Sutskever, Ilya},
title = {Language Models are Unsupervised Multitask Learners},
institution = {OpenAI},
type = {Technical Report},
year = {2019}
}

Energy and Policy Considerations for Deep Learning in NLP
Proceedings of ACL, 3645–3650 (2019)
BibTeX
@inproceedings{strubell2019energy,
author = {Strubell, Emma and Ganesh, Ananya and McCallum, Andrew},
title = {Energy and Policy Considerations for Deep Learning in {NLP}},
booktitle = {Proceedings of ACL},
pages = {3645--3650},
year = {2019}
}

Language Models are Few-Shot Learners
Advances in Neural Information Processing Systems (NeurIPS), 33, 1877–1901 (2020)
BibTeX
@inproceedings{brown2020,
author = {Brown, Tom B. and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D. and others},
title = {Language Models are Few-Shot Learners},
booktitle = {Advances in Neural Information Processing Systems (NeurIPS)},
volume = {33},
pages = {1877--1901},
year = {2020}
}

The Curious Case of Neural Text Degeneration
Proceedings of ICLR (2020)
BibTeX
@inproceedings{holtzman2020curious,
author = {Holtzman, Ari and Buys, Jan and Du, Li and Forbes, Maxwell and Choi, Yejin},
title = {The Curious Case of Neural Text Degeneration},
booktitle = {Proceedings of ICLR},
year = {2020}
}

Scaling Laws for Neural Language Models
arXiv preprint arXiv:2001.08361 (2020)
BibTeX
@article{kaplan2020,
author = {Kaplan, Jared and McCandlish, Sam and Henighan, Tom and Brown, Tom B. and Chess, Benjamin and others},
title = {Scaling Laws for Neural Language Models},
journal = {arXiv preprint arXiv:2001.08361},
year = {2020}
}

Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks
Advances in Neural Information Processing Systems (NeurIPS), 33, 9459–9474 (2020)
BibTeX
@inproceedings{lewis2020retrieval,
author = {Lewis, Patrick and Perez, Ethan and Piktus, Aleksandra and Petroni, Fabio and others},
title = {Retrieval-Augmented Generation for Knowledge-Intensive {NLP} Tasks},
booktitle = {Advances in Neural Information Processing Systems (NeurIPS)},
volume = {33},
pages = {9459--9474},
year = {2020}
}

Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer
Journal of Machine Learning Research (JMLR), 21(140), 1–67 (2020)
BibTeX
@article{raffel2020exploring,
author = {Raffel, Colin and Shazeer, Noam and Roberts, Adam and Lee, Katherine and others},
title = {Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer},
journal = {Journal of Machine Learning Research (JMLR)},
volume = {21},
number = {140},
pages = {1--67},
year = {2020}
}

A General Language Assistant as a Laboratory for Alignment
arXiv preprint arXiv:2112.00861 (2021)
BibTeX
@article{askell2021general,
author = {Askell, Amanda and Bai, Yuntao and Chen, Anna and Drain, Dawn and others},
title = {A General Language Assistant as a Laboratory for Alignment},
journal = {arXiv preprint arXiv:2112.00861},
year = {2021}
}

Extracting Training Data from Large Language Models
Proceedings of the 30th USENIX Security Symposium, 2633–2650 (2021)
BibTeX
@inproceedings{carlini2021extracting,
author = {Carlini, Nicholas and Tramer, Florian and Wallace, Eric and Jagielski, Matthew and others},
title = {Extracting Training Data from Large Language Models},
booktitle = {Proceedings of the 30th USENIX Security Symposium},
pages = {2633--2650},
year = {2021}
}

Carbon Emissions and Large Neural Network Training
arXiv preprint arXiv:2104.10350 (2021)
BibTeX
@article{patterson2021carbon,
author = {Patterson, David and Gonzalez, Joseph and Le, Quoc and Liang, Chen and others},
title = {Carbon Emissions and Large Neural Network Training},
journal = {arXiv preprint arXiv:2104.10350},
year = {2021}
}

Learning Transferable Visual Models From Natural Language Supervision
Proceedings of ICML, 8748–8763 (2021)
BibTeX
@inproceedings{radford2021learning,
author = {Radford, Alec and Kim, Jong Wook and Hallacy, Chris and Ramesh, Aditya and others},
title = {Learning Transferable Visual Models From Natural Language Supervision},
booktitle = {Proceedings of ICML},
pages = {8748--8763},
year = {2021}
}

RoFormer: Enhanced Transformer with Rotary Position Embedding
arXiv preprint arXiv:2104.09864 (2021)
BibTeX
@article{su2021roformer,
author = {Su, Jianlin and Lu, Yu and Pan, Shengfeng and Murtadha, Ahmed and Wen, Bo and Liu, Yunfeng},
title = {{RoFormer}: Enhanced Transformer with Rotary Position Embedding},
journal = {arXiv preprint arXiv:2104.09864},
year = {2021}
}

Constitutional AI: Harmlessness from AI Feedback
arXiv preprint arXiv:2212.08073 (2022)
BibTeX
@article{bai2022constitutional,
author = {Bai, Yuntao and Kadavath, Saurav and Kundu, Sandipan and Askell, Amanda and others},
title = {Constitutional {AI}: Harmlessness from {AI} Feedback},
journal = {arXiv preprint arXiv:2212.08073},
year = {2022}
}

Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity
Journal of Machine Learning Research (JMLR), 23(120), 1–39 (2022)
BibTeX
@article{fedus2022switch,
author = {Fedus, William and Zoph, Barret and Shazeer, Noam},
title = {Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity},
journal = {Journal of Machine Learning Research (JMLR)},
volume = {23},
number = {120},
pages = {1--39},
year = {2022}
}

Training Compute-Optimal Large Language Models
Advances in Neural Information Processing Systems (NeurIPS), 35, 30016–30030 (2022)
BibTeX
@inproceedings{hoffmann2022,
author = {Hoffmann, Jordan and Borgeaud, Sebastian and Mensch, Arthur and Buchatskaya, Elena and others},
title = {Training Compute-Optimal Large Language Models},
booktitle = {Advances in Neural Information Processing Systems (NeurIPS)},
volume = {35},
pages = {30016--30030},
year = {2022}
}

LoRA: Low-Rank Adaptation of Large Language Models
Proceedings of ICLR (2022)
BibTeX
@inproceedings{hu2021,
author = {Hu, Edward J. and Shen, Yelong and Wallis, Phillip and Allen-Zhu, Zeyuan and Li, Yuanzhi and Wang, Shean and Wang, Lu and Chen, Weizhu},
title = {{LoRA}: Low-Rank Adaptation of Large Language Models},
booktitle = {Proceedings of ICLR},
year = {2022}
}

Training Language Models to Follow Instructions with Human Feedback
Advances in Neural Information Processing Systems (NeurIPS), 35, 27730–27744 (2022)
BibTeX
@inproceedings{ouyang2022,
author = {Ouyang, Long and Wu, Jeffrey and Jiang, Xu and Almeida, Diogo and others},
title = {Training Language Models to Follow Instructions with Human Feedback},
booktitle = {Advances in Neural Information Processing Systems (NeurIPS)},
volume = {35},
pages = {27730--27744},
year = {2022}
}

Red Teaming Language Models with Language Models
Proceedings of EMNLP, 3419–3448 (2022)
BibTeX
@inproceedings{perez2022red,
author = {Perez, Ethan and Ringer, Sam and others},
title = {Red Teaming Language Models with Language Models},
booktitle = {Proceedings of EMNLP},
pages = {3419--3448},
year = {2022}
}

Train Short, Test Long: Attention with Linear Biases Enables Input Length Extrapolation
Proceedings of ICLR (2022)
BibTeX
@inproceedings{press2022train,
author = {Press, Ofir and Smith, Noah A. and Lewis, Mike},
title = {Train Short, Test Long: Attention with Linear Biases Enables Input Length Extrapolation},
booktitle = {Proceedings of ICLR},
year = {2022}
}

Chain-of-Thought Prompting Elicits Reasoning in Large Language Models
Advances in Neural Information Processing Systems (NeurIPS), 35, 24824–24837 (2022)
BibTeX
@inproceedings{wei2022,
author = {Wei, Jason and Wang, Xuezhi and Schuurmans, Dale and Bosma, Maarten and others},
title = {Chain-of-Thought Prompting Elicits Reasoning in Large Language Models},
booktitle = {Advances in Neural Information Processing Systems (NeurIPS)},
volume = {35},
pages = {24824--24837},
year = {2022}
}

Emergent Abilities of Large Language Models
Transactions on Machine Learning Research (TMLR) (2022)
BibTeX
@article{wei2022emergent,
author = {Wei, Jason and Tay, Yi and Bommasani, Rishi and Raffel, Colin and others},
title = {Emergent Abilities of Large Language Models},
journal = {Transactions on Machine Learning Research (TMLR)},
year = {2022}
}

What Learning Algorithm Is In-Context Learning? Investigations with Linear Models
Proceedings of ICLR (2023)
BibTeX
@inproceedings{akyurek2023what,
author = {Aky{\"u}rek, Ekin and Schuurmans, Dale and Andreas, Jacob and Ma, Tengyu and Zhou, Denny},
title = {What Learning Algorithm Is In-Context Learning? {Investigations} with Linear Models},
booktitle = {Proceedings of ICLR},
year = {2023}
}

Visual Instruction Tuning
Advances in Neural Information Processing Systems (NeurIPS), 36 (2023)
BibTeX
@inproceedings{liu2023visual,
author = {Liu, Haotian and Li, Chunyuan and Wu, Qingyang and Lee, Yong Jae},
title = {Visual Instruction Tuning},
booktitle = {Advances in Neural Information Processing Systems (NeurIPS)},
volume = {36},
year = {2023}
}

Direct Preference Optimization: Your Language Model Is Secretly a Reward Model
Advances in Neural Information Processing Systems (NeurIPS), 36 (2023)
BibTeX
@inproceedings{rafailov2023,
author = {Rafailov, Rafael and Sharma, Archit and Mitchell, Eric and Ermon, Stefano and Manning, Christopher D. and Finn, Chelsea},
title = {Direct Preference Optimization: Your Language Model Is Secretly a Reward Model},
booktitle = {Advances in Neural Information Processing Systems (NeurIPS)},
volume = {36},
year = {2023}
}

Are Emergent Abilities of Large Language Models a Mirage?
Advances in Neural Information Processing Systems (NeurIPS), 36 (2023)
BibTeX
@inproceedings{schaeffer2023emergent,
author = {Schaeffer, Rylan and Miranda, Brando and Koyejo, Sanmi},
title = {Are Emergent Abilities of Large Language Models a Mirage?},
booktitle = {Advances in Neural Information Processing Systems (NeurIPS)},
volume = {36},
year = {2023}
}

LLaMA: Open and Efficient Foundation Language Models
arXiv preprint arXiv:2302.13971 (2023)
BibTeX
@article{touvron2023,
author = {Touvron, Hugo and Lavril, Thibaut and Izacard, Gautier and Martinet, Xavier and others},
title = {{LLaMA}: Open and Efficient Foundation Language Models},
journal = {arXiv preprint arXiv:2302.13971},
year = {2023}
}

Self-Consistency Improves Chain of Thought Reasoning in Language Models
Proceedings of ICLR (2023)
BibTeX
@inproceedings{wang2023selfconsistency,
author = {Wang, Xuezhi and Wei, Jason and Schuurmans, Dale and Le, Quoc and others},
title = {Self-Consistency Improves Chain of Thought Reasoning in Language Models},
booktitle = {Proceedings of ICLR},
year = {2023}
}

ReAct: Synergizing Reasoning and Acting in Language Models
Proceedings of ICLR (2023)
BibTeX
@inproceedings{yao2023react,
author = {Yao, Shunyu and Zhao, Jeffrey and Yu, Dian and Du, Nan and others},
title = {{ReAct}: Synergizing Reasoning and Acting in Language Models},
booktitle = {Proceedings of ICLR},
year = {2023}
}

Mixtral of Experts
arXiv preprint arXiv:2401.04088 (2024)
BibTeX
@article{jiang2024mixtral,
author = {Jiang, Albert Q. and Sablayrolles, Alexandre and Roux, Antoine and Mensch, Arthur and others},
title = {Mixtral of Experts},
journal = {arXiv preprint arXiv:2401.04088},
year = {2024}
}

[Figure: Publication timeline. Dot size reflects the number of papers per year.]