# load the .csv version of the dataset (read.csv() cannot parse Stata's .dta format)
fp = read.csv("/filepath/fp.csv")
Getting Started
For now, all code examples in R and Stata will use a dataset on the Federalist Papers that can be found on this page. This dataset includes a host of different datatypes that can be manipulated to learn different coding techniques and tricks. This dataset is referred to as fp
throughout the site.
This dataset is a modified version of the tidy_federalist.RDS
dataset I retrieved on June 27, 2025 from Levi Nicklas’ public GitHub Repository that houses data on the Federalist Papers. The modifications I made include organizing variables and observations in a way I found useful, filling in missing data, creating a few useful variables (e.g., duration between publication), and denoting disputed authorship of some papers (see Mosteller and Wallace 1963, 1964). Full code used to transform Nicklas’ original data into the fp
dataset appears in the Data Modifications Code section below.
Federalist Papers Data
A .csv file of the fp
dataset can be downloaded here (recommended for R users).
A .dta file of the fp
dataset can be downloaded here (recommended for Stata users).
Before each code chunk in the examples throughout this site, users should load in the fp
dataset.
To load in the .csv file of the fp
dataset in R, run the following code:
To load in the .dta file of the fp
dataset in Stata, run the following code:
* load the .dta version of the dataset, replacing any data currently in memory
use "/filepath/fp.dta", clear
Data Modifications Code
# dataset downloaded on 6/27/2025 from https://github.com/Levi-Nicklas/FederalistPapers/blob/main/scripts/Collect_and_CleanData.R
# dataset creator is Levi Nicklas (GitHub username is Levi-Nicklas)
# Nicklas acquired the text of the Federalist Papers used to create this dataset from Project Gutenberg, so the text is trustworthy
# load in Nicklas' dataset
fp = readRDS("~/Desktop/tidy_federalist.RDS")
# give the id and author columns the names used in the rest of this script
colnames(fp)[colnames(fp) == "paper_num"] = "fedId"
colnames(fp)[colnames(fp) == "Author"] = "author"
# now I modify Nicklas' dataset to my liking by changing obs and changing/adding vars
# since all papers begin with "To the People of the State of New York", I extract all text before this to create the baseline for the title var
# (at this point the title still holds the publication, date, and author, which are pulled out further below)
fp$title = gsub("To the People of the State of New York.*", "", fp$text)
# next, I remove everything before "To the People of the State of New York" to get each essay on its own
# perl = TRUE is needed for the lookahead, which keeps the salutation itself in place
fp$text = gsub(".*(?=To the People of the State of New York)", "", fp$text, perl = TRUE)
# now I remove "To the People of the State of New York" to get just the text of the essay
# all except one paper end with a colon, the one ends with a period, so the character class accepts exactly those two marks
fp$text = sub("To the People of the State of New York[:.] ", "", fp$text)
# since each paper begins with the author name, the new title var will naturally include that name so I have to remove it
# the longest alternative is listed first so that "MADISON, with HAMILTON" is removed as a whole
# and no stray ", with " is left behind, regardless of how the regex engine orders alternatives
fp$title = gsub("MADISON, with HAMILTON|HAMILTON|JAY|MADISON", "", fp$title)
# create a date variable
# will be useful for duration analysis!
# remove anything before a day of the week, since all the pubDates start that way
fp$pubDate = sub(".*(Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday) ?", "\\1", fp$title)
# remove the trailing period
fp$pubDate = sub("\\.\\s*$", "", fp$pubDate)
# an error in Project Gutenberg!
# says Fed 26 was published on Saturday, December 22, 1788, but it was actually 1787!
# same with Fed 30, was actually Friday, December 28, 1787
# fixing now
fp$pubDate = ifelse(fp$fedId == 26, "Saturday, December 22, 1787", fp$pubDate)
fp$pubDate = ifelse(fp$fedId == 30, "Friday, December 28, 1787", fp$pubDate)
# get in pubDate data format
# NOTE: %A and %B are matched against the current locale's day and month names,
# so this conversion assumes an English locale; anything that fails to parse becomes NA
fp$pubDate = as.Date(fp$pubDate, format = "%A, %B %d, %Y")
# calculate some durations
library(lubridate)
library(dplyr)
# days since previous paper
fp = fp %>%
  arrange(pubDate) %>% # ensure rows are ordered by date
  mutate(daysPrevFP = as.numeric(pubDate - lag(pubDate)))
# replace the one NA at the beginning with 0
fp$daysPrevFP = ifelse(is.na(fp$daysPrevFP), 0, fp$daysPrevFP)
# the remaining durations subtract one Date from another, which gives whole days directly
# and stores them as plain numbers like daysPrevFP; passing a character date to difftime()
# would have it parsed in the local time zone, leaving a fractional offset to round away
# days since the end of the Constitutional Convention (1787-09-17)
fp$daysConv = as.numeric(fp$pubDate - as.Date("1787-09-17"))
# days since the publication of Cato 1 (1787-09-27)
fp$daysCato1 = as.numeric(fp$pubDate - as.Date("1787-09-27"))
# days since the publication of Brutus 1 (1787-10-18)
fp$daysBrutus1 = as.numeric(fp$pubDate - as.Date("1787-10-18"))
# there were only four publications for the papers, so I can just use those to create the var
# create publication variable
# seems like the words "For the" and "From the" and "From" appear before each publication, using that to get what I want
# this must run before the title cleanup, because it reads the publication name out of the still-uncleaned title
fp$publication = sub(".*(Daily Advertiser|Independent Journal|New York Packet|McLEAN's Edition, New York).*", "\\1", fp$title, ignore.case = TRUE)
# inconsistency in one publication, appears in original text as "MCLEAN's" or "McLEAN's" so I standardize to the one that appears more often
fp$publication = sub("MCLEAN's", "McLEAN's", fp$publication)
# now I can finally correct the titles
# it appears that the name of the publication appears after the words "For the" and "From the"
# first I remove all text after the publication name and then after those words
fp$title = sub("(Daily Advertiser|Independent Journal|New York Packet|McLEAN's Edition, New York).*", "", fp$title, ignore.case = TRUE)
# then I do the fors and froms, making sure I don't mess with Fed. 45 that has it in the middle
fp$title = ifelse(fp$fedId != 45, sub("(For the|From the).*", "", fp$title, ignore.case = TRUE), fp$title)
# fix Fed. 45
# NOTE(review): this pattern also deletes the word "Considered" from the Fed. 45 title -- confirm that is intended
fp$title = ifelse(fp$fedId == 45, sub("Considered For the ", "", fp$title, ignore.case = TRUE), fp$title)
# titles for a few papers end with "from" in a leftover to indicate publication, but titles in other papers also contain the word
# I just remove the word from the titles that end with "from" (only the first match in Fed. 78-85 is removed)
fp$title = ifelse(fp$fedId %in% 78:85, sub("from", "", fp$title, ignore.case = TRUE), fp$title)
# remove the trailing period
fp$title = sub("\\.\\s*$", "", fp$title)
# have only the first letter of each author's name be capitalized
fp$author = paste0(toupper(substr(fp$author, 1, 1)), tolower(substr(fp$author, 2, nchar(fp$author))))

# the original dataset lists Fed. 18, 19, 20 obs for the author var as "Multiple" but I want to be more specific
# so I fix these obs because it should be "Madison, with Hamilton" according to Project Gutenberg
fp$author = ifelse(fp$author == "Multiple", "Madison, with Hamilton", fp$author)

# Project Gutenberg attributes each paper to Hamilton, Jay, Madison, or Madison, with Hamilton
# several papers have disputed authorship
# according to Mosteller and Wallace's 1963 article and 1964 book, Fed. 49-58, and 62-63 are disputed
# I modify the obs for these papers to reflect this, even though multiple scholars have attributed authorship
fp$author = ifelse(fp$fedId %in% c(49:58, 62, 63), "DISPUTED", fp$author)
# Fed. 18, 19, and 20 are potentially disputed, but not to the degree that the other noted papers are, so I ignore this possibility for now
# organize the variables: keep only the columns listed here, in this order
fp = subset(fp, select = c(fedId, author, title, pubDate, publication, text, daysPrevFP, daysConv, daysCato1, daysBrutus1))
# save the data for R (row.names = FALSE keeps the row index out of the file)
write.csv(fp, "~/Desktop/fp.csv", row.names = FALSE)
# save the data for Stata
haven::write_dta(fp, "~/Desktop/fp.dta")
References
Mosteller, Frederick and David L. Wallace. 1963. “Inference in an authorship problem: A comparative study of discrimination methods applied to the authorship of the disputed Federalist Papers.” Journal of the American Statistical Association 58(302): 275–309.
Mosteller, Frederick and David L. Wallace. 1964. Inference and disputed authorship: The Federalist. Addison-Wesley.
Nicklas, Levi. 2020. “FederalistPapers.” GitHub repository. https://github.com/Levi-Nicklas/FederalistPapers