Import an Excel file with multiple sheets to R local data frames
for (i in 1:length(getSheets(wb))){
assign(getSheets(wb)[i],
tbl_df(read.xls('./RData/sampleA.xls',
sheet = getSheets(wb)[i],
stringsAsFactors=F,
as.is=T)))
}
Create a SQLite database with tables from R local data frames
sampleA_db <- src_sqlite("./RData/sampleA.sqlite3", create = TRUE)
copy_to(sampleA_db, as.data.frame(dimSchool), name = "dimSchool", temporary = FALSE)
## Source: sqlite 3.8.6 [./RData/sampleA.sqlite3]
## From: dimSchool [42 x 16]
##
## SchoolKEY AspireSchoolID CustomID CommonName Abbreviation Latitude
## 1 393 18 6118178 UCS UCS 37.68896
## 2 454 18 6118178 UCS UCS 37.68896
## 3 491 18 6118178 UCS UCS 37.68896
## 4 525 2 101956 Ben Holt BHA 38.03712
## 5 531 18 6118178 UCS UCS 37.68896
## 6 565 2 101956 Ben Holt BHA 38.03712
## 7 572 18 6118178 UCS UCS 37.68896
## 8 606 2 101956 Ben Holt BHA 38.03712
## 9 608 4 108647 Rosa Parks RPA 37.93878
## 10 616 18 6118178 UCS UCS 37.68896
## .. ... ... ... ... ... ...
## Variables not shown: Longitude (dbl), YearOpened (int), YearsInService
## (int), LowGrade (chr), HighGrade (chr), IsActive (chr), RowIsCurrent
## (chr), RowStartDate (chr), RowEndDate (chr), X (int)
copy_to(sampleA_db, as.data.frame(dimStudent), name = "dimStudent", temporary = FALSE)
## Source: sqlite 3.8.6 [./RData/sampleA.sqlite3]
## From: dimStudent [470 x 7]
##
## StudentKEY StudentNumber ScrambledName Class RowIsCurrent
## 1 395669 301483 Katyi Aainord 2016 N
## 2 385920 301516 Natki Maiaors 2016 N
## 3 386318 301537 Satmr Aaidors 2016 N
## 4 427509 301537 Satmr Aaidors 2016 N
## 5 439149 301537 Satmr Aaidors 2016 N
## 6 389177 375200 Eatlt Hainor. 2019 N
## 7 442372 375200 Eatlt Hainor. 2019 N
## 8 430156 375200 Eatlt Hainor. 2019 N
## 9 420749 375200 Eatlt Hainor. 2019 N
## 10 369121 375200 Eatlt Hainor. 2019 N
## .. ... ... ... ... ...
## Variables not shown: RowStartDate (chr), RowEndDate (chr)
copy_to(sampleA_db, as.data.frame(dimTest), name = "dimTest", temporary = FALSE)
## Source: sqlite 3.8.6 [./RData/sampleA.sqlite3]
## From: dimTest [14 x 7]
##
## TestKEY TestID TestType TestName
## 1 38582 1.788805e+13 Benchmark Reading Winter Benchmark
## 2 38585 1.788805e+13 Benchmark Math Winter Benchmark
## 3 38588 1.788805e+13 Benchmark Writing Winter Benchmark
## 4 38589 8.880515e+11 Benchmark Writing Winter Benchmark
## 5 38590 1.888805e+13 Benchmark ELA Winter Benchmark
## 6 38599 8.880515e+11 Benchmark Reading Winter Benchmark
## 7 38607 8.880532e+11 Benchmark Math Winter Benchmark
## 8 38643 1.688805e+13 Interim Math Interim #2
## 9 38652 8.880452e+11 Interim Math Interim #2
## 10 38669 1.688803e+13 Interim Biology Interim #2
## 11 38671 8.880285e+11 Interim Biology Interim #2
## 12 38683 8.880666e+11 Benchmark Math Winter Benchmark
## 13 38684 1.788807e+13 Benchmark Math Winter Benchmark
## 14 38935 2.102002e+13 SRI Winter SRI
## Variables not shown: TestNameDetail (chr), IsTotalScore (chr), IsSubScore
## (chr)
copy_to(sampleA_db, as.data.frame(dimTestQuestion), name = "dimTestQuestion", temporary = FALSE)
## Source: sqlite 3.8.6 [./RData/sampleA.sqlite3]
## From: dimTestQuestion [439 x 8]
##
## TestQuestionKEY TestID QuestionNumber
## 1 842598 888053223964 28
## 2 842602 888051491207 2
## 3 842606 888051491207 3
## 4 842612 888053223964 19
## 5 842619 888051491207 27
## 6 842638 888051491207 17
## 7 842659 888053223964 2
## 8 842667 888051491207 34
## 9 842670 888051523209 1
## 10 842673 888053223964 24
## .. ... ... ...
## Variables not shown: QuestionDescription (chr), Answer (chr),
## AnswerDescription (chr), IsCorrectAnswer (chr), DateCreated (chr)
copy_to(sampleA_db, as.data.frame(factTestScores), name = "factTestScores", temporary = FALSE)
## Source: sqlite 3.8.6 [./RData/sampleA.sqlite3]
## From: factTestScores [812 x 23]
##
## factTestScoresKEY TestKEY TestStandardKEY TestQuestionKEY StudentKEY
## 1 10717056 38671 2596 846896 389437
## 2 10717057 38671 2609 847965 389437
## 3 10717058 38671 2602 847954 389380
## 4 10717086 38671 2593 847135 389437
## 5 10717087 38671 2593 847587 389380
## 6 10717213 38671 2593 848010 389437
## 7 10717214 38671 2609 847235 389437
## 8 10717215 38671 2609 847979 389380
## 9 10717216 38671 2605 847917 389437
## 10 10717217 38671 2597 848040 389437
## .. ... ... ... ... ...
## Variables not shown: DateKEY (int), SchoolKEY (int), CalendarKEY (int),
## AltKEY (int), PreviousKEY (int), PLKEY (int), PointsEarned (int),
## PointsPossible (int), Student2KEY (int), GrowthKEY (int), Prev (chr),
## Year1 (chr), Year2 (chr), AltName (chr), SourceName (chr), OfficialKEY
## (int), LatestKEY (int), IntKEY (int)
Read back tables from a newly-created database
# Readback the newly-created database
sampleA_db = src_sqlite("./RData/sampleA.sqlite3")
factTestScores = sampleA_db %>% tbl("factTestScores")
dimSchool = sampleA_db %>% tbl("dimSchool")
dimStudent = sampleA_db %>% tbl("dimStudent")
dimTest = sampleA_db %>% tbl("dimTest")
dimTestQuestion = sampleA_db %>% tbl("dimTestQuestion")
Scenario 1
# Direct access tables via tbl and SQL queries
sampleA_db %>%
tbl(sql(
"
SELECT factTestScoresKEY, PointsEarned, PointsPossible
FROM factTestScores
JOIN dimSchool USING (SchoolKEY)
JOIN dimTest USING (TestKEY)
WHERE CommonName='Rosa Parks' AND
TestName='Winter SRI'
"
))
## Source: sqlite 3.8.6 [./RData/sampleA.sqlite3]
## From: <derived table> [?? x 3]
##
## Variables not shown: factTestScoresKEY (int), PointsEarned (int),
## PointsPossible (int)
# Access tables like a R data frame with dplyr
factTestScores %>%
left_join(dimSchool, by = 'SchoolKEY') %>%
left_join(dimTest, by = 'TestKEY') %>%
filter(CommonName=='Rosa Parks', TestName=='Winter SRI') %>%
select(factTestScoresKEY, PointsEarned, PointsPossible) %>%
as.data.frame() %>% # make it local data frame
tbl_df() %>% # convert it to tbl
print() # can see dimensions
## Source: local data frame [0 x 3]
##
## Variables not shown: factTestScoresKEY (int), PointsEarned (int),
## PointsPossible (int)
Scenario 2
# Direct access tables via tbl and SQL queries
sampleA_db %>%
tbl(sql(
"
SELECT TestQuestionKEY, AVG(PointsEarned)
FROM factTestScores
JOIN dimTest USING (TestKEY)
JOIN dimTestQuestion USING (TestQuestionKEY)
WHERE TestName='Biology Interim #2'
GROUP BY TestQuestionKEY
"
))
## Source: sqlite 3.8.6 [./RData/sampleA.sqlite3]
## From: <derived table> [?? x 2]
##
## TestQuestionKEY AVG(PointsEarned)
## 1 846842 1
## 2 846846 0
## 3 846851 1
## 4 846869 0
## 5 846874 0
## 6 846893 0
## 7 846896 1
## 8 846914 1
## 9 846916 0
## 10 846931 0
## .. ... ...
# Access tables like a R data frame with dplyr
factTestScores %>%
inner_join(dimTest, by = 'TestKEY') %>%
filter(TestName=='Biology Interim #2') %>%
group_by(TestQuestionKEY) %>%
summarise(avgs=mean(PointsEarned)) %>%
as.data.frame() %>%
tbl_df() %>%
print()
## Source: local data frame [109 x 2]
##
## TestQuestionKEY avgs
## 1 846842 1
## 2 846846 0
## 3 846851 1
## 4 846869 0
## 5 846874 0
## 6 846893 0
## 7 846896 1
## 8 846914 1
## 9 846916 0
## 10 846931 0
## .. ... ...
Scenario 3
# use dplyr like SQL queries
# Check on primary key
dimTestQuestion %>%
select(TestQuestionKEY) %>% # primary key
summarise(n())
## Source: sqlite 3.8.6 [./RData/sampleA.sqlite3]
## From: <derived table> [?? x 1]
##
## n()
## 1 439
## .. ...
dimTestQuestion %>%
select(TestQuestionKEY) %>%
distinct() %>%
summarise(n())
## Source: sqlite 3.8.6 [./RData/sampleA.sqlite3]
## From: <derived table> [?? x 1]
##
## n()
## 1 439
## .. ...
# Check on non-primary key
dimTestQuestion %>%
select(Answer) %>%
summarise(n())
## Source: sqlite 3.8.6 [./RData/sampleA.sqlite3]
## From: <derived table> [?? x 1]
##
## n()
## 1 439
## .. ...
dimTestQuestion %>%
select(Answer) %>%
distinct() %>%
summarise(n())
## Source: sqlite 3.8.6 [./RData/sampleA.sqlite3]
## From: <derived table> [?? x 1]
##
## n()
## 1 6
## .. ...
# Check on fixed-length field
dimTestQuestion %>%
data.frame() %>%
select(TestQuestionKEY, TestID) %>%
filter(nchar(TestID)!=12) %>%
format(digits=15)
## TestQuestionKEY TestID
## 1 842801 17888051523209
## 2 842983 17888051491207
## 3 843001 17888053223964
## 4 843036 17888051491207
## 5 843392 17888053223964
## 6 843635 17888051491207
## 7 844037 18888051523209
## 8 844481 17888053223964
## 9 845353 16888045183762
## 10 845356 16888045183762
## 11 847096 16888028458048
## 12 847463 16888028458048
## 13 848229 16888028458048
## 14 848686 17888066558754
## 15 849594 21020020110106
# Check on positive field
dimTestQuestion %>%
data.frame() %>%
select(TestQuestionKEY, QuestionNumber) %>%
filter(QuestionNumber <= 0)
## TestQuestionKEY QuestionNumber
## 1 842801 -1
## 2 842983 -1
## 3 843001 -1
## 4 843036 -1
## 5 843392 -1
## 6 843635 -1
## 7 844037 -1
## 8 844481 -1
## 9 845353 -1
## 10 845356 -1
## 11 847096 -1
## 12 847463 -1
## 13 848229 -1
## 14 848686 -1
## 15 849594 -1
# Check on Excel errors and empty cells in a field
dimTestQuestion %>%
data.frame() %>%
select(TestQuestionKEY, QuestionNumber) %>%
filter(QuestionNumber == '')
## [1] TestQuestionKEY QuestionNumber
## <0 rows> (or 0-length row.names)
dimTestQuestion %>%
data.frame() %>%
select(TestQuestionKEY, AnswerDescription) %>%
filter(AnswerDescription == '#NAME?')
## TestQuestionKEY AnswerDescription
## 1 842619 #NAME?
## 2 843324 #NAME?
## 3 843393 #NAME?
## 4 843559 #NAME?
## 5 843755 #NAME?
## 6 844414 #NAME?
# Using regular expression
dimTestQuestion %>%
data.frame() %>%
select(TestQuestionKEY, AnswerDescription) %>%
filter(str_detect(AnswerDescription, '^#.+?$'))
## TestQuestionKEY AnswerDescription
## 1 842619 #NAME?
## 2 843324 #NAME?
## 3 843393 #NAME?
## 4 843559 #NAME?
## 5 843755 #NAME?
## 6 844414 #NAME?
# Check on fixed-value field
dimTestQuestion %>%
data.frame() %>%
select(TestQuestionKEY, Answer) %>%
filter(!(Answer %in% c('A','B','C','D','NR')))
## TestQuestionKEY Answer
## 1 842801 -----
## 2 842983 -----
## 3 843001 -----
## 4 843036 -----
## 5 843392 -----
## 6 843635 -----
## 7 844037 -----
## 8 844481 -----
## 9 845353 -----
## 10 845356 -----
## 11 847096 -----
## 12 847463 -----
## 13 848229 -----
## 14 848686 -----
## 15 849594 -----
# Using regular expression
dimTestQuestion %>%
data.frame() %>%
select(TestQuestionKEY, Answer) %>%
filter(str_detect(Answer, '[^[:alnum:]]'))
## TestQuestionKEY Answer
## 1 842801 -----
## 2 842983 -----
## 3 843001 -----
## 4 843036 -----
## 5 843392 -----
## 6 843635 -----
## 7 844037 -----
## 8 844481 -----
## 9 845353 -----
## 10 845356 -----
## 11 847096 -----
## 12 847463 -----
## 13 848229 -----
## 14 848686 -----
## 15 849594 -----
# Using regular expression
dimTestQuestion %>%
data.frame() %>%
select(TestQuestionKEY, Answer) %>%
filter(str_detect(Answer, '[^ABCDNR]'))
## TestQuestionKEY Answer
## 1 842801 -----
## 2 842983 -----
## 3 843001 -----
## 4 843036 -----
## 5 843392 -----
## 6 843635 -----
## 7 844037 -----
## 8 844481 -----
## 9 845353 -----
## 10 845356 -----
## 11 847096 -----
## 12 847463 -----
## 13 848229 -----
## 14 848686 -----
## 15 849594 -----
dimTestQuestion %>%
data.frame() %>%
select(TestQuestionKEY, IsCorrectAnswer) %>%
filter(!(IsCorrectAnswer %in% c('Y','N')))
## TestQuestionKEY IsCorrectAnswer
## 1 842801 -
## 2 842983 -
## 3 843001 -
## 4 843036 -
## 5 843392 -
## 6 843635 -
## 7 844037 -
## 8 844481 -
## 9 845353 -
## 10 845356 -
## 11 847096 -
## 12 847463 -
## 13 848229 -
## 14 848686 -
## 15 849594 -
# Using regular expression
dimTestQuestion %>%
data.frame() %>%
select(TestQuestionKEY, IsCorrectAnswer) %>%
filter(str_detect(IsCorrectAnswer, '[^YN]'))
## TestQuestionKEY IsCorrectAnswer
## 1 842801 -
## 2 842983 -
## 3 843001 -
## 4 843036 -
## 5 843392 -
## 6 843635 -
## 7 844037 -
## 8 844481 -
## 9 845353 -
## 10 845356 -
## 11 847096 -
## 12 847463 -
## 13 848229 -
## 14 848686 -
## 15 849594 -