SEQUELFun - Play SQL with R (dplyr)

0.1 Import an Excel file with multiple sheets to R local data frames
0.2 Create a SQLite database with tables from R local data frames
0.3 Read back tables from a newly-created database
0.4 Scenario 1
0.5 Scenario 2
0.6 Scenario 3

0.1 Import an Excel file with multiple sheets to R local data frames

for (i in 1:length(getSheets(wb))){
  assign(getSheets(wb)[i], 
         tbl_df(read.xls('./RData/sampleA.xls', 
                         sheet = getSheets(wb)[i],
                         stringsAsFactors=F, 
                         as.is=T)))
}

0.2 Create a SQLite database with tables from R local data frames

sampleA_db <- src_sqlite("./RData/sampleA.sqlite3", create = TRUE)

copy_to(sampleA_db, as.data.frame(dimSchool), name = "dimSchool", temporary = FALSE)

## Source: sqlite 3.8.6 [./RData/sampleA.sqlite3]
## From: dimSchool [42 x 16]
## 
##    SchoolKEY AspireSchoolID CustomID CommonName Abbreviation Latitude
## 1        393             18  6118178        UCS          UCS 37.68896
## 2        454             18  6118178        UCS          UCS 37.68896
## 3        491             18  6118178        UCS          UCS 37.68896
## 4        525              2   101956   Ben Holt          BHA 38.03712
## 5        531             18  6118178        UCS          UCS 37.68896
## 6        565              2   101956   Ben Holt          BHA 38.03712
## 7        572             18  6118178        UCS          UCS 37.68896
## 8        606              2   101956   Ben Holt          BHA 38.03712
## 9        608              4   108647 Rosa Parks          RPA 37.93878
## 10       616             18  6118178        UCS          UCS 37.68896
## ..       ...            ...      ...        ...          ...      ...
## Variables not shown: Longitude (dbl), YearOpened (int), YearsInService
##   (int), LowGrade (chr), HighGrade (chr), IsActive (chr), RowIsCurrent
##   (chr), RowStartDate (chr), RowEndDate (chr), X (int)

copy_to(sampleA_db, as.data.frame(dimStudent), name = "dimStudent", temporary = FALSE)

## Source: sqlite 3.8.6 [./RData/sampleA.sqlite3]
## From: dimStudent [470 x 7]
## 
##    StudentKEY StudentNumber ScrambledName Class RowIsCurrent
## 1      395669        301483 Katyi Aainord  2016            N
## 2      385920        301516 Natki Maiaors  2016            N
## 3      386318        301537 Satmr Aaidors  2016            N
## 4      427509        301537 Satmr Aaidors  2016            N
## 5      439149        301537 Satmr Aaidors  2016            N
## 6      389177        375200 Eatlt Hainor.  2019            N
## 7      442372        375200 Eatlt Hainor.  2019            N
## 8      430156        375200 Eatlt Hainor.  2019            N
## 9      420749        375200 Eatlt Hainor.  2019            N
## 10     369121        375200 Eatlt Hainor.  2019            N
## ..        ...           ...           ...   ...          ...
## Variables not shown: RowStartDate (chr), RowEndDate (chr)

copy_to(sampleA_db, as.data.frame(dimTest), name = "dimTest", temporary = FALSE)

## Source: sqlite 3.8.6 [./RData/sampleA.sqlite3]
## From: dimTest [14 x 7]
## 
##    TestKEY       TestID  TestType                 TestName
## 1    38582 1.788805e+13 Benchmark Reading Winter Benchmark
## 2    38585 1.788805e+13 Benchmark    Math Winter Benchmark
## 3    38588 1.788805e+13 Benchmark Writing Winter Benchmark
## 4    38589 8.880515e+11 Benchmark Writing Winter Benchmark
## 5    38590 1.888805e+13 Benchmark     ELA Winter Benchmark
## 6    38599 8.880515e+11 Benchmark Reading Winter Benchmark
## 7    38607 8.880532e+11 Benchmark    Math Winter Benchmark
## 8    38643 1.688805e+13   Interim          Math Interim #2
## 9    38652 8.880452e+11   Interim          Math Interim #2
## 10   38669 1.688803e+13   Interim       Biology Interim #2
## 11   38671 8.880285e+11   Interim       Biology Interim #2
## 12   38683 8.880666e+11 Benchmark    Math Winter Benchmark
## 13   38684 1.788807e+13 Benchmark    Math Winter Benchmark
## 14   38935 2.102002e+13       SRI               Winter SRI
## Variables not shown: TestNameDetail (chr), IsTotalScore (chr), IsSubScore
##   (chr)

copy_to(sampleA_db, as.data.frame(dimTestQuestion), name = "dimTestQuestion", temporary = FALSE)

## Source: sqlite 3.8.6 [./RData/sampleA.sqlite3]
## From: dimTestQuestion [439 x 8]
## 
##    TestQuestionKEY       TestID QuestionNumber
## 1           842598 888053223964             28
## 2           842602 888051491207              2
## 3           842606 888051491207              3
## 4           842612 888053223964             19
## 5           842619 888051491207             27
## 6           842638 888051491207             17
## 7           842659 888053223964              2
## 8           842667 888051491207             34
## 9           842670 888051523209              1
## 10          842673 888053223964             24
## ..             ...          ...            ...
## Variables not shown: QuestionDescription (chr), Answer (chr),
##   AnswerDescription (chr), IsCorrectAnswer (chr), DateCreated (chr)

copy_to(sampleA_db, as.data.frame(factTestScores), name = "factTestScores", temporary = FALSE)

## Source: sqlite 3.8.6 [./RData/sampleA.sqlite3]
## From: factTestScores [812 x 23]
## 
##    factTestScoresKEY TestKEY TestStandardKEY TestQuestionKEY StudentKEY
## 1           10717056   38671            2596          846896     389437
## 2           10717057   38671            2609          847965     389437
## 3           10717058   38671            2602          847954     389380
## 4           10717086   38671            2593          847135     389437
## 5           10717087   38671            2593          847587     389380
## 6           10717213   38671            2593          848010     389437
## 7           10717214   38671            2609          847235     389437
## 8           10717215   38671            2609          847979     389380
## 9           10717216   38671            2605          847917     389437
## 10          10717217   38671            2597          848040     389437
## ..               ...     ...             ...             ...        ...
## Variables not shown: DateKEY (int), SchoolKEY (int), CalendarKEY (int),
##   AltKEY (int), PreviousKEY (int), PLKEY (int), PointsEarned (int),
##   PointsPossible (int), Student2KEY (int), GrowthKEY (int), Prev (chr),
##   Year1 (chr), Year2 (chr), AltName (chr), SourceName (chr), OfficialKEY
##   (int), LatestKEY (int), IntKEY (int)

0.3 Read back tables from a newly-created database

# Readback the newly-created database
sampleA_db = src_sqlite("./RData/sampleA.sqlite3")

factTestScores = sampleA_db %>% tbl("factTestScores")
dimSchool = sampleA_db %>% tbl("dimSchool")
dimStudent = sampleA_db %>% tbl("dimStudent")
dimTest = sampleA_db %>% tbl("dimTest")
dimTestQuestion = sampleA_db %>% tbl("dimTestQuestion")

0.4 Scenario 1

# Direct access tables via tbl and SQL queries
sampleA_db %>%
  tbl(sql(
    "
    SELECT factTestScoresKEY, PointsEarned, PointsPossible
    FROM factTestScores 
    JOIN dimSchool USING (SchoolKEY)
    JOIN dimTest USING (TestKEY)
    WHERE CommonName='Rosa Parks' AND
    TestName='Winter SRI'
    "
  ))

## Source: sqlite 3.8.6 [./RData/sampleA.sqlite3]
## From: <derived table> [?? x 3]
## 
## Variables not shown: factTestScoresKEY (int), PointsEarned (int),
##   PointsPossible (int)

# Access tables like a R data frame with dplyr
factTestScores %>% 
  left_join(dimSchool, by = 'SchoolKEY') %>%
  left_join(dimTest, by = 'TestKEY') %>%
  filter(CommonName=='Rosa Parks', TestName=='Winter SRI') %>%
  select(factTestScoresKEY, PointsEarned, PointsPossible) %>%
  as.data.frame() %>% # make it local data frame
  tbl_df() %>% # convert it to tbl
  print() # can see dimensions

## Source: local data frame [0 x 3]
## 
## Variables not shown: factTestScoresKEY (int), PointsEarned (int),
##   PointsPossible (int)

0.5 Scenario 2

# Direct access tables via tbl and SQL queries
sampleA_db %>%
  tbl(sql(
    "
    SELECT TestQuestionKEY, AVG(PointsEarned)
    FROM factTestScores
    JOIN dimTest USING (TestKEY)
    JOIN dimTestQuestion USING (TestQuestionKEY)
    WHERE TestName='Biology Interim #2'
    GROUP BY TestQuestionKEY
    "
  ))

## Source: sqlite 3.8.6 [./RData/sampleA.sqlite3]
## From: <derived table> [?? x 2]
## 
##    TestQuestionKEY AVG(PointsEarned)
## 1           846842                 1
## 2           846846                 0
## 3           846851                 1
## 4           846869                 0
## 5           846874                 0
## 6           846893                 0
## 7           846896                 1
## 8           846914                 1
## 9           846916                 0
## 10          846931                 0
## ..             ...               ...

# Access tables like a R data frame with dplyr
factTestScores %>%
  inner_join(dimTest, by = 'TestKEY') %>%
  filter(TestName=='Biology Interim #2') %>%  
  group_by(TestQuestionKEY) %>%
  summarise(avgs=mean(PointsEarned)) %>%
  as.data.frame() %>%
  tbl_df() %>%
  print()

## Source: local data frame [109 x 2]
## 
##    TestQuestionKEY avgs
## 1           846842    1
## 2           846846    0
## 3           846851    1
## 4           846869    0
## 5           846874    0
## 6           846893    0
## 7           846896    1
## 8           846914    1
## 9           846916    0
## 10          846931    0
## ..             ...  ...

0.6 Scenario 3

# use dplyr like SQL queries
# Check on primary key
dimTestQuestion %>%
  select(TestQuestionKEY) %>% # primary key
  summarise(n())

## Source: sqlite 3.8.6 [./RData/sampleA.sqlite3]
## From: <derived table> [?? x 1]
## 
##    n()
## 1  439
## .. ...

dimTestQuestion %>%
  select(TestQuestionKEY) %>%
  distinct() %>%
  summarise(n())

## Source: sqlite 3.8.6 [./RData/sampleA.sqlite3]
## From: <derived table> [?? x 1]
## 
##    n()
## 1  439
## .. ...

# Check on non-primary key
dimTestQuestion %>%
  select(Answer) %>%
  summarise(n())

## Source: sqlite 3.8.6 [./RData/sampleA.sqlite3]
## From: <derived table> [?? x 1]
## 
##    n()
## 1  439
## .. ...

dimTestQuestion %>%
  select(Answer) %>%
  distinct() %>%
  summarise(n())

## Source: sqlite 3.8.6 [./RData/sampleA.sqlite3]
## From: <derived table> [?? x 1]
## 
##    n()
## 1    6
## .. ...

# Check on fixed-length field
dimTestQuestion %>%
  data.frame() %>%
  select(TestQuestionKEY, TestID) %>%
  filter(nchar(TestID)!=12) %>%
  format(digits=15)

##    TestQuestionKEY         TestID
## 1           842801 17888051523209
## 2           842983 17888051491207
## 3           843001 17888053223964
## 4           843036 17888051491207
## 5           843392 17888053223964
## 6           843635 17888051491207
## 7           844037 18888051523209
## 8           844481 17888053223964
## 9           845353 16888045183762
## 10          845356 16888045183762
## 11          847096 16888028458048
## 12          847463 16888028458048
## 13          848229 16888028458048
## 14          848686 17888066558754
## 15          849594 21020020110106

# Check on positive field
dimTestQuestion %>%
  data.frame() %>%
  select(TestQuestionKEY, QuestionNumber) %>%
  filter(QuestionNumber <= 0)

##    TestQuestionKEY QuestionNumber
## 1           842801             -1
## 2           842983             -1
## 3           843001             -1
## 4           843036             -1
## 5           843392             -1
## 6           843635             -1
## 7           844037             -1
## 8           844481             -1
## 9           845353             -1
## 10          845356             -1
## 11          847096             -1
## 12          847463             -1
## 13          848229             -1
## 14          848686             -1
## 15          849594             -1

# Check on Excel errors and empty cells in a field
dimTestQuestion %>%
  data.frame() %>%
  select(TestQuestionKEY, QuestionNumber) %>%
  filter(QuestionNumber == '')

## [1] TestQuestionKEY QuestionNumber 
## <0 rows> (or 0-length row.names)

dimTestQuestion %>%
  data.frame() %>%
  select(TestQuestionKEY, AnswerDescription) %>%
  filter(AnswerDescription == '#NAME?')

##   TestQuestionKEY AnswerDescription
## 1          842619            #NAME?
## 2          843324            #NAME?
## 3          843393            #NAME?
## 4          843559            #NAME?
## 5          843755            #NAME?
## 6          844414            #NAME?

# Using regular expression
dimTestQuestion %>%
  data.frame() %>%
  select(TestQuestionKEY, AnswerDescription) %>%
  filter(str_detect(AnswerDescription, '^#.+?$'))

##   TestQuestionKEY AnswerDescription
## 1          842619            #NAME?
## 2          843324            #NAME?
## 3          843393            #NAME?
## 4          843559            #NAME?
## 5          843755            #NAME?
## 6          844414            #NAME?

# Check on fixed-value field
dimTestQuestion %>%
  data.frame() %>%
  select(TestQuestionKEY, Answer) %>%
  filter(!(Answer %in% c('A','B','C','D','NR')))

##    TestQuestionKEY Answer
## 1           842801  -----
## 2           842983  -----
## 3           843001  -----
## 4           843036  -----
## 5           843392  -----
## 6           843635  -----
## 7           844037  -----
## 8           844481  -----
## 9           845353  -----
## 10          845356  -----
## 11          847096  -----
## 12          847463  -----
## 13          848229  -----
## 14          848686  -----
## 15          849594  -----

# Using regular expression
dimTestQuestion %>%
  data.frame() %>%
  select(TestQuestionKEY, Answer) %>%
  filter(str_detect(Answer, '[^[:alnum:]]'))

##    TestQuestionKEY Answer
## 1           842801  -----
## 2           842983  -----
## 3           843001  -----
## 4           843036  -----
## 5           843392  -----
## 6           843635  -----
## 7           844037  -----
## 8           844481  -----
## 9           845353  -----
## 10          845356  -----
## 11          847096  -----
## 12          847463  -----
## 13          848229  -----
## 14          848686  -----
## 15          849594  -----

# Using regular expression
dimTestQuestion %>%
  data.frame() %>%
  select(TestQuestionKEY, Answer) %>%
  filter(str_detect(Answer, '[^ABCDNR]'))

##    TestQuestionKEY Answer
## 1           842801  -----
## 2           842983  -----
## 3           843001  -----
## 4           843036  -----
## 5           843392  -----
## 6           843635  -----
## 7           844037  -----
## 8           844481  -----
## 9           845353  -----
## 10          845356  -----
## 11          847096  -----
## 12          847463  -----
## 13          848229  -----
## 14          848686  -----
## 15          849594  -----

dimTestQuestion %>%
  data.frame() %>%
  select(TestQuestionKEY, IsCorrectAnswer) %>%
  filter(!(IsCorrectAnswer %in% c('Y','N')))

##    TestQuestionKEY IsCorrectAnswer
## 1           842801               -
## 2           842983               -
## 3           843001               -
## 4           843036               -
## 5           843392               -
## 6           843635               -
## 7           844037               -
## 8           844481               -
## 9           845353               -
## 10          845356               -
## 11          847096               -
## 12          847463               -
## 13          848229               -
## 14          848686               -
## 15          849594               -

# Using regular expression
dimTestQuestion %>%
  data.frame() %>%
  select(TestQuestionKEY, IsCorrectAnswer) %>%
  filter(str_detect(IsCorrectAnswer, '[^YN]'))

##    TestQuestionKEY IsCorrectAnswer
## 1           842801               -
## 2           842983               -
## 3           843001               -
## 4           843036               -
## 5           843392               -
## 6           843635               -
## 7           844037               -
## 8           844481               -
## 9           845353               -
## 10          845356               -
## 11          847096               -
## 12          847463               -
## 13          848229               -
## 14          848686               -
## 15          849594               -