Practicum – Jeremy Gerdes (CODE on GitHub)

I have selected a public data source: the metadata of all documents in the ADAMS public library, found on the Nuclear Regulatory Commission’s site.

Get the metadata with the following function (this took roughly 6 hours to pull down and ingest, row by row, into a data set of 1.25 million records):

#get.nrc.metadata.R"
# get.nrc.metadata.R
# Download per-day document metadata from the NRC ADAMS public library and
# accumulate everything into a single data frame saved as an .Rdata file.
#
# days             - vector of Date values; one ADAMS query is issued per day
#                    (the first query sweeps everything from 1954 up to days[1]).
# data.directory   - directory for the per-day .xml and .Rdata cache files.
# fGetHourly       - reserved flag, currently unused (kept for compatibility).
# results.filename - file the combined results are saved to; defaults to the
#                    caller's global nrc.document.results.filename when set.
#
# Returns the combined data frame of all downloaded rows (NULL if none).
download.nrc.document.meta <- function(days, data.directory, fGetHourly = FALSE,
                                       results.filename = get0("nrc.document.results.filename",
                                                               ifnotfound = "nrc.document.data.frame.Rdata")) {
  # days <- seq(from=as.Date("12/23/2004",format="%m/%d/%Y"), to=as.Date(Sys.Date()+1),by='days' )
  # BUG FIX: initialize the accumulator so the merge logic below cannot pick
  # up (or error on) a variable that only exists in the global environment.
  nrc.document.results <- NULL
  for (intDay in seq_along(days)) {
    # U.S. Nuclear Regulatory Commission public documents
    # See page 16 of https://www.nrc.gov/site-help/developers/wba-api-developer-guide.pdf
    # Find all reports by date range in the NRC's public library
    nrc.var.name <- paste0("nrc.document.", format(days[intDay], "%Y_%m_%d"))
    # BUG FIX: file.path(dir, name, ".xml") produced "dir/name/.xml"; the
    # extension must be appended to the name, not added as a path component.
    # (This also makes the resume check below actually find the saved files.)
    nrc.xml.filename <- file.path(data.directory, paste0(nrc.var.name, ".xml"))
    nrc.rdata.filename <- file.path(data.directory, paste0(nrc.var.name, ".Rdata"))
    # Resume where we left off: a cached .Rdata file holds that day's rows.
    if (file.exists(nrc.rdata.filename)) {
      print(paste0("Loading:", nrc.rdata.filename))
      load(nrc.rdata.filename)  # restores nrc.rows (NULL for days with no documents)
      if (exists("nrc.rows", inherits = FALSE) && is.null(nrc.rows)) {
        rm("nrc.rows")
      }
    } else {
      if (intDay == 1) {
        # First query: everything from the start of the record up to days[1].
        query.dates <- paste0("(left:%2701/01/1954+12:00+AM%27,",
                              "right:%27", format(days[intDay], "%m/%d/%Y"), "+12:00+AM%27),")
      } else {
        # BUG FIX: on the final iteration days[intDay + 1] is NA, which put the
        # literal text "NA" into the query URL; use the next calendar day.
        upper.day <- if (intDay < length(days)) days[intDay + 1] else days[intDay] + 1
        query.dates <- paste0("(left:%27", format(days[intDay], "%m/%d/%Y"), "+12:00+AM%27,",
                              "right:%27", format(upper.day, "%m/%d/%Y"), "+12:00+AM%27),")
      }
      xml2::download_xml(
        paste0(
          "https://adams.nrc.gov/wba/services/search/advanced/nrc?q=",
          "(mode:sections,",
          "sections:(filters:(public-library:!t),",
          "options:(within-folder:(enable:!f,insubfolder:!f,path:%27%27)),",
          "properties_search_all:!(",
          "!(PublishDatePARS,range,",
          query.dates,
          "%27%27))))&",
          "qn=New&tab=advanced-search-pars&s=%24title&so=ASC"
        ),
        file = nrc.xml.filename
      )
      nrc.document.raw <- xml2::read_xml(nrc.xml.filename)
      file.remove(nrc.xml.filename)
      nrc.document.list <- xml2::as_list(nrc.document.raw)
      if (nrc.document.list$search$count == 1000) {
        # ADAMS caps one query at 1000 results: remember days that hit the cap
        # so they can be re-pulled at a finer granularity later.
        if (exists("error.days", inherits = FALSE)) {
          error.days <- rbind(error.days, days[intDay])
        } else {
          error.days <- matrix(days[intDay])
        }
        warning(paste0(format(days[intDay], "%y_%m_%d"), "-Exceeds Max Query Return Value of 1000"),
                call. = TRUE, immediate. = TRUE, noBreaks. = TRUE)
        print(error.days[length(error.days)])
      }
      if (nrc.document.list$search$count == 0) {
        warning(paste0(format(days[intDay], "%y_%m_%d"), "-", nrc.document.list$search$matches),
                call. = TRUE, immediate. = TRUE, noBreaks. = TRUE)
      } else {
        print(paste0("Attempting to import: ", nrc.document.list$search$count,
                     " rows for date: ", days[intDay], " at:", Sys.time()))
        nrc.document <- nrc.document.list$search$resultset
        # Convert each <result> element into a one-row matrix, then bind all
        # rows in one call (much faster than growing the table row by row).
        row.list <- lapply(nrc.document, function(doc) {
          tmp.row <- matrix(unlist(doc), ncol = length(doc))
          colnames(tmp.row) <- names(doc)
          tmp.row
        })
        nrc.rows <- do.call(gtools::smartbind, row.list)
      }
      # Persist this day's rows (NULL when the day had none) so an interrupted
      # run can resume without re-querying the service.
      # BUG FIX: the original tested length(nrc.rows > 0) -- the comparison sat
      # inside length() -- and saved into the working directory rather than to
      # nrc.rdata.filename, so the resume check above never found the files.
      if (exists("nrc.rows", inherits = FALSE) && length(nrc.rows) > 0) {
        rownames(nrc.rows) <- seq_len(nrow(nrc.rows))
        save(nrc.rows, file = nrc.rdata.filename)
      } else {
        nrc.rows <- NULL
        save(nrc.rows, file = nrc.rdata.filename)
        rm("nrc.rows")
      }
    }
    # Fold this day's rows into the running result set.
    if (exists("nrc.rows", inherits = FALSE)) {
      if (is.null(nrc.document.results)) {
        nrc.document.results <- nrc.rows
      } else {
        nrc.document.results <- gtools::smartbind(nrc.document.results, nrc.rows)
      }
      rm("nrc.rows")
    }
  }
  if (exists("error.days", inherits = FALSE)) {
    save(error.days, file = paste0("errors.", results.filename))
  }
  save(nrc.document.results, file = results.filename)
  # BUG FIX: return the combined results; save() returns NULL invisibly, so
  # callers assigning this function's value previously got nothing usable.
  nrc.document.results
}

Called with:

# Import only!!!
# rm(list=ls()) 
# Choose a working directory: next to the active RStudio document when the
# rstudioapi package is installed, otherwise a per-user application directory.
if ("rstudioapi" %in% rownames(installed.packages())) {
  base.dir <- dirname(rstudioapi::getActiveDocumentContext()$path)
} else if (.Platform$OS.type == "windows") { # not running from RStudio, use a common path
  base.dir <- Sys.getenv("LOCALAPPDATA")
} else {
  base.dir <- Sys.getenv("HOME")
}
project.wd <- file.path(base.dir, "nrc.documents")
if (!dir.exists(project.wd)) {
  dir.create(project.wd, recursive = TRUE)
}
setwd(project.wd)

nrc.document.results.filename <- "nrc.document.data.frame.Rdata"
# Only download when no previously saved result set can be loaded.
if (file.exists(nrc.document.results.filename)) {
  load(nrc.document.results.filename)
} else if (file.exists("../Results/FinalDownload.Rdata")) {
  load("../Results/FinalDownload.Rdata")
} else {
  # Downloading XML and ingesting the data into a dataframe for every day in
  # the database runs at a rate of roughly 5 years of records per hour.
  # Looping through Dates https://stackoverflow.com/a/32611574
  days <- seq(from = as.Date("10/09/1999", format = "%m/%d/%Y"),
              to = as.Date(Sys.Date() + 1), by = "days")
  source(file.path(dirname(getwd()), "get.nrc.metadata.R"))
  nrc.document.results <- download.nrc.document.meta(days, project.wd, FALSE)
}

Wrangle the data set with:

#rm(list=ls())
# Choose the Results working directory (mirrors the import script's logic).
if ("rstudioapi" %in% rownames(installed.packages())){
  project.wd <-file.path(dirname(rstudioapi::getActiveDocumentContext()$path),"Results")
} else { # not running from Rstudio, use a common path
  if (.Platform$OS.type == "windows") { 
    project.wd <- file.path(Sys.getenv("LOCALAPPDATA"),"nrc.documents/Results")
  } else {
    project.wd <- file.path(Sys.getenv("HOME"),"nrc.documents/Results")
  }
}
if (!dir.exists(project.wd)){
  dir.create(project.wd,recursive = TRUE)
}
setwd(project.wd)

# Load the downloaded metadata unless it is already in memory.
if (!exists("nrc.document.results")){
  nrc.document.results.filename <- "nrc.document.data.frame.Rdata"
  # Only import if the results file doesn't exist
  if (file.exists(nrc.document.results.filename)){
    load(nrc.document.results.filename) 
  } else if(file.exists("FinalDownload.Rdata")) {
    load("FinalDownload.Rdata") 
  } else {
    # BUG FIX: dir.name() is not a base R function; dirname() is intended.
    source(file.path(dirname(getwd()),"PracticumNrcGetAllDocuments.R"))
    load(nrc.document.results.filename)  
  }
}

# Functions
# Parse an NRC document-date string (e.g. "10/03/2006 08:57 AM EDT") to POSIXct.
# Only the date portion ("%m/%d/%Y") is consumed; time-of-day and zone suffix
# are ignored by the format string.
# BUG FIX: the example value was a bare (dead) string expression in the body,
# and as_datetime was unqualified; lubridate:: is now explicit.
getNrcDocumentDateAsPosix <- function(date.character, time.zone = "UTC") {
  lubridate::as_datetime(
    timeDate::strptimeDate(
      as.character(date.character),
      format = "%m/%d/%Y",
      tz = time.zone
    )
  )
}

# Parse an NRC date string in "%m/%d/%Y" form into a POSIXct value.
getNrcDateAsPosix <- function(date.character, time.zone = "UTC") {
  parsed <- timeDate::strptimeDate(
    as.character(date.character),
    format = "%m/%d/%Y",
    tz = time.zone
  )
  lubridate::as_datetime(parsed)
}

# Parse a long NRC date string such as "10/10/1999 01:44 PM EDT".
# The highest granularity used in this script is daily, so the zone suffix is
# ignored; "UTC" keeps parsing deterministic (Sys.timezone() would also work).
getNrcLongDateAsPosix <- function(date.character, time.zone = "UTC") {
  stamp <- timeDate::strptimeDate(
    as.character(date.character),
    format = "%m/%d/%Y %H:%M",
    tz = time.zone
  )
  lubridate::as_datetime(stamp)
}

# Normalize document types: lower case with collapsed internal whitespace.
# (A dead duplicate assignment to nrc.document.results.test was removed.)
nrc.document.results$DocumentType <- stringr::str_to_lower(stringr::str_squish(nrc.document.results$DocumentType))

# Fix all invalid and missing document dates. Rows whose DocumentDate string
# is short enough to be a plain date are kept; the rest are set aside.
test.nrc.docs <- subset(nrc.document.results, nchar(as.character(nrc.document.results$DocumentDate)) <= 15)

# {TODO} incorporate the cleaned and updated bad.document.dates into the dataset, for now we ignore them
# bad.document.dates <-dplyr::anti_join(nrc.document.results, test.nrc.docs, by=c("AccessionNumber", "PublishDatePARS"))
# bad.document.dates$DocumentDate <- getNrcDateAsPosix(bad.document.dates$PublishDatePARS)  
# bad.document.dates <- bad.document.dates[]
# Warning this needs additional data cleaning!!!
# nrc.document.results <- rbind(test.nrc.docs,bad.document.dates)
#rm("bad.document.dates")

nrc.document.results <- test.nrc.docs
rm("test.nrc.docs")

# Document Types are concatenated with commas, but many document types also
# include commas, so we need to replace expected commas with another string
# for a list of types that we need to take action on (without accidentally
# identifying commas that should be separators). See:
#   appendix F of the API guide: https://www.nrc.gov/site-help/developers/wba-api-developer-guide.pdf
options(stringsAsFactors = FALSE)
doc.type.find.replace <-readr::read_csv("doc.type.find.replace.csv")
# Build a two-column Find/Replace table: the original pattern alongside the
# same pattern with "," swapped for ";".
# NOTE(review): str_replace_all is applied to the whole data frame, not to a
# character column; this presumably relies on the CSV having a single column.
# doc.type.find.replace[[1]] would make the intent explicit -- confirm.
doc.type.find.replace <- as.data.frame(
  cbind(
    doc.type.find.replace,
    stringr::str_replace_all(doc.type.find.replace,",",";")
  )
)  
colnames(doc.type.find.replace)<-c("Find","Replace")

# Perform our find/replace (wasn't able to pass in the vectors).
# Currently takes 2-3 seconds per loop iteration.
for(intFindReplace in seq_along(doc.type.find.replace$Find)){
  print(paste0(intFindReplace,":",Sys.time(),":Find '", doc.type.find.replace$Find[intFindReplace], "'"))
  nrc.document.results$DocumentType <-stringr::str_replace_all(
    nrc.document.results$DocumentType,
    doc.type.find.replace$Find[intFindReplace],
    doc.type.find.replace$Replace[intFindReplace]
  )
}

# Fill missing document types with a placeholder, then tidy the whitespace.
doc.types.filled <- tidyr::replace_na(nrc.document.results$DocumentType, "- None Specified")
nrc.document.results$DocumentType <- stringr::str_trim(stringr::str_squish(doc.types.filled))

names(nrc.document.results)
# Exploring nrc.document.results: keep only the columns used downstream.
keep.columns <- c(
  "AccessionNumber", "DocumentType",
  "EstimatedPageCount", "DocumentReportNumber",
  "Keyword", "LicenseNumber", "PublishDatePARS",
  "DocumentDate", "ContentSize"
)
nrc.document <- nrc.document.results[keep.columns]

# Forecast document type occurrences per time period (weekly, monthly,
# quarterly, annually). A value listing k types contains k-1 separating commas.
nrc.document$DocumentTypeCount <- stringr::str_count(nrc.document$DocumentType, ",") + 1

max(nrc.document$DocumentTypeCount)


# Found up to 8 types applied to any individual document; split the
# comma-separated list into one column per type.
type.columns <- paste0("DocumentType", 1:9)
nrc.document <- tidyr::separate(nrc.document, DocumentType, type.columns, sep = ",")

#rm("nrc.document.type.ls")
# One data frame of distinct, non-missing values per DocumentTypeN column.
nrc.document.type.ls <- lapply(
  paste0("DocumentType", 1:9),
  function(column.name) {
    unique(na.omit(as.data.frame(nrc.document[[column.name]])))
  }
)

rm("nrc.document")
# Split data into document type categories: give every per-column table the
# same column name, then stack them into one data frame.
# (seq_along replaces 1:length(), which misbehaves on zero-length lists, and
# a single bind_rows call replaces the incremental bind in the loop.)
for (intDoc in seq_along(nrc.document.type.ls)) {
  colnames(nrc.document.type.ls[[intDoc]]) <- "DocumentType"
}
nrc.document.type <- dplyr::bind_rows(nrc.document.type.ls)

rm("nrc.document.type.ls")
# BUG FIX: the calls were nested incorrectly -- str_to_lower(x, "Not
# Specified") passed the placeholder as a locale and left replace_na()
# without a replacement value. Lower-case first, then substitute the
# placeholder for missing values, then tidy the whitespace.
nrc.document.type$DocumentType <- stringr::str_trim(
  stringr::str_squish(
    tidyr::replace_na(
      stringr::str_to_lower(nrc.document.type$DocumentType),
      "Not Specified"
    )
  )
)

nrc.document.type <- as.data.frame(nrc.document.type)
# Distinct, sorted list of document types, exported for manual categorizing.
nrc.document.type <- as.data.frame(base::sort(unique(nrc.document.type$DocumentType)))
colnames(nrc.document.type) <- "DocumentType"
readr::write_csv(nrc.document.type, "nrc.document.types.csv")

# Categories of document types are found in Appendix F of the API guide listed above
# 10 CFR § 2.206
# https://www.nrc.gov/reading-rm/doc-collections/cfr/part002/part002-0206.html
# Print each distinct document type for manual review.
for (doc.type in nrc.document.type$DocumentType) {
  print(doc.type)
}

# Format date columns
# DocumentDate holds short "%m/%d/%Y" strings; PublishDatePARS holds the long
# "10/10/1999 01:44 PM EDT" form parsed by getNrcLongDateAsPosix.
nrc.document.results$DocumentDate <- getNrcDateAsPosix(nrc.document.results$DocumentDate)
nrc.document.results$PublishDatePARS <-getNrcLongDateAsPosix(nrc.document.results$PublishDatePARS)
#rm("total.count.per.day")
# Documents ingested into ADAMS per calendar day ("%D" is "%m/%d/%y").
total.count.per.day <- table(factor(format(nrc.document.results$PublishDatePARS,"%D")))
#{TODO} Un-pipe everything forward if this is made into a package.
library(tidyverse)
# Days whose upload count is exactly 1000 -- the ADAMS per-query cap -- are
# days whose downloads were probably truncated.
days.with.errors <-
  total.count.per.day %>% as.data.frame %>% dplyr::filter(Freq == 1000)
#now for document date instead of published date:

rm("total.count.per.day")
total.count.per.day <- table(factor(nrc.document.results$DocumentDate))
#plot(total.count.per.day)
total.count.per.day <- as.data.frame(total.count.per.day)
# Keep only days after 1999-11-01.
# NOTE(review): the inner as_date() already yields a Date, so the outer
# as_date(..., format=) appears redundant (format is ignored for Date input)
# -- confirm before simplifying.
total.count.per.day.rng <- total.count.per.day[
  lubridate::as_date(lubridate::as_date(total.count.per.day$Var1),format="%Y/%m/%d",tz = "UTC")>
    lubridate::as_date("11/01/1999",format="%m/%d/%Y",tz = "UTC"),
]

# Restrict the document-level analysis to the same post-1999-11-01 window.
nrc.document.1999.on <- nrc.document.results[
  lubridate::as_date(lubridate::as_date(nrc.document.results$DocumentDate),format="%Y/%m/%d",tz = "UTC")>
    lubridate::as_date("11/01/1999",format="%m/%d/%Y",tz = "UTC"),
  ]

#ggplot(total.count.per.day.rng,aes(x=total.count.per.day.rng$Var1,y=total.count.per.day.rng$Freq))
#total.count.per.day.ts <- ts(total.count.per.day)

# 10 CFR parts here: https://www.nrc.gov/reading-rm/doc-collections/cfr/
#Significant Enforcement Actions:  https://www.nrc.gov/reading-rm/doc-collections/datasets/enforcement-actions.xls
# Regex patterns (one per row) identifying issue/deficiency document types.
issue.doc.type <- read_csv("issue.document.category.csv")

# Collect documents whose DocumentType matches any issue pattern.
# BUG FIX: the original indexed nrc.document.results with a logical mask
# computed from nrc.document.1999.on (different row counts), silently
# recycling the mask; both sides now use nrc.document.1999.on. Matches are
# gathered in a preallocated list and bound once instead of rbind-in-a-loop.
issue.match.list <- vector("list", length(issue.doc.type$IssueDocType))
for (intIssue in seq_along(issue.doc.type$IssueDocType)) {
  print(paste0(intIssue, ":", Sys.time(), ": finding issue ", issue.doc.type[intIssue, ]))
  issue.match.list[[intIssue]] <- nrc.document.1999.on[
    grepl(as.character(issue.doc.type[intIssue, ]), nrc.document.1999.on$DocumentType),
  ]
}
nrc.document.issues <- do.call(rbind, issue.match.list)

I’m using the Document Date for all analysis, as it’s the date the document was completed (written), not the date it was submitted to the ADAMS database.

Plotting all documents by count per day looks like this (it’s surprising, as no single day in this dataset is >1000 for the Database Upload Date, but several days are >1000 when looking at the Document origination dates).

# Issue documents per month, keyed by "YYYY-MM".
issue.count.per.month <-as.data.frame(table(factor(format(lubridate::as_date(nrc.document.issues$DocumentDate),"%Y-%m"))))
names(issue.count.per.month)
# FIX for the [TODO]: filter rows by value instead of the magic row offset
# -(1:438). "%Y-%m" strings sort lexicographically, so a plain string
# comparison keeps months from December 1999 onward, matching the
# ts(start = c(1999, 12)) declaration below.
issue.count.per.month.rng <-
  issue.count.per.month[as.character(issue.count.per.month$Var1) >= "1999-12", ]
issue.count.ts <- ts(issue.count.per.month.rng$Freq, frequency = 12, start = c(1999, 12))
plot(issue.count.ts)

Smoothing

# Exponential smoothing with trend, no seasonal component (gamma = FALSE).
# NOTE(review): beta = TRUE is unusual for HoltWinters -- the default
# (beta = NULL) *estimates* the trend parameter while beta = FALSE disables
# it; the printed coefficients below show "beta : TRUE", so confirm TRUE is
# what was intended here.
issue.count.ts.Holt <- HoltWinters(issue.count.ts, beta = TRUE, gamma = FALSE)
Smoothing parameters:
 alpha: 0.202727
 beta : TRUE
 gamma: FALSE

Coefficients:
       [,1]
a 94.604885
b -4.504753

HoltWinters

# Holt's linear-trend smoothing (no seasonal component: gamma = FALSE).
# NOTE(review): this repeats the identical fit from the "Smoothing" section
# above -- presumably kept so this section runs standalone.
issue.count.ts.Holt <- HoltWinters(issue.count.ts, beta = TRUE, gamma = FALSE)
issue.count.ts.Holt # 5345.801

plot(issue.count.ts.Holt, main = "NRC ADAMS Issue/Deficiency Document Count 1999 - 2019")



# Mean squared error of the Holt fit (SSE averaged over the series length).
issue.count.Holt.mse <- issue.count.ts.Holt$SSE/length(issue.count.ts)
issue.count.Holt.mse

# Next, fit a Holt-Winters exponentially smoothed model for trend and seasonality.
issue.count.HoltWinter <- HoltWinters(issue.count.ts, beta = TRUE, gamma = TRUE)
issue.count.HoltWinter
plot(issue.count.HoltWinter, main = "NRC ADAMS Issue/Deficiency Document Count 1999 - 2019")
issue.count.HoltWinter.mse <- issue.count.HoltWinter$SSE/length(issue.count.ts)
issue.count.HoltWinter.mse # 12175.27
# Try auto.arima()
issue.count.auto <- forecast::auto.arima(issue.count.ts)
issue.count.auto
# BUG FIX: the diagnostic plot titles referred to a "Births 1946-1959"
# dataset with an ARIMA(2,1,2)(1,1,1) model -- copied from another exercise.
# They now describe this NRC series; the fitted model reported below is
# ARIMA(1,1,1).
acf(issue.count.auto$residuals, col = "red", main = "NRC Issue Document Counts ARIMA(1,1,1) Residuals ACF")
issue.count.auto.stdresid <- issue.count.auto$residuals/sqrt(issue.count.auto$sigma2)
plot(issue.count.auto.stdresid, type = "p", main = "NRC Issue Document Counts ARIMA(1,1,1) Std Residuals")
qqnorm(issue.count.auto.stdresid, main = "QQ Norm Plot issue.count.auto.stdresid")
qqline(issue.count.auto.stdresid)
Series: issue.count.ts 
ARIMA(1,1,1) 

Coefficients:
         ar1      ma1
      0.2199  -0.9749
s.e.  0.0682   0.0217

sigma^2 estimated as 3101:  log likelihood=-1283.8
AIC=2573.59   AICc=2573.7   BIC=2583.98
IssueDocType (regex)
abnormal occurrence reports \(see also ler & ro\)
cnwra corrective action request
cnwra qa nonconformance report
daily event report
deficiency correspondence \(per 10cfr50.55e and part 21\)
deficiency report \(per 10cfr50.55e and part 21\)
deficiency reports \(per 10cfr50.55e & part 21\)
doe corrective action request
doe ympo standard deficiency report
enforcement action
enforcement action worksheet
enforcement notification
event report from state
exercise of enforcement discretion
individual action \(enforcement\)
individual response to enforcement action
licensee 30-day written event report
licensee event report
licensee response to enforcement action
licensee response to notice of violation
non-cited violation
notice of deviation
notice of enforcement discretion \(noed\)
notice of non-conformance
notice of violation
notice of violation of a regulation
notice of violation with proposed imposition of civil penalty
npdes noncompliance notification
nrc preliminary notification of event/occurrence
nrc regulatory issue summary
oig event inquiry
part 21 correspondence
plant issues matrix
preliminary notification of event or occurrence \(pno
problem/maintenance report
safeguard incident report
security form-security incident report
security incidence report
significant event notification
significant event report
written event report \(30 or 60 day\)

Plan on hosting R Shiny Apps at https://bsetmet.com/??? using the tutorial found here.

Leave a Reply

This site uses Akismet to reduce spam. Learn how your comment data is processed.