Getting and cleaning data——Week2

2021-10-21  本文已影响0人  Chamberzero

课程github地址

Week2 内容
Content
- Reading from MySQL 
- Reading from HDF5 
- Reading from The web 
- Reading from APIs
- Reading from Other Sources 

1.Reading from MySQL

Connecting and listing databases

# Open a connection to the MySQL server at genome-mysql.soe.ucsc.edu as the
# "genome" user. NOTE(review): MySQL() presumably comes from RMySQL, which
# must already be loaded -- confirm.
ucscDb <- dbConnect(
  MySQL(),
  user = "genome",
  host = "genome-mysql.soe.ucsc.edu"
)

# Ask the server for its list of databases, then close the connection
# straight away so it is not left open.
result <- dbGetQuery(ucscDb, "show Databases;")
dbDisconnect(ucscDb)

Connecting to hg19 and listing tables

# Connect directly to the hg19 database on the same server.
hg19 <- dbConnect(
  MySQL(),
  user = "genome",
  db = "hg19",
  host = "genome-mysql.soe.ucsc.edu"
)

# List the tables through the hg19 connection. The earlier ucscDb handle was
# closed with dbDisconnect() right after its query, so it cannot be reused.
allTables <- dbListTables(hg19)
length(allTables)
[1] 12535
# Peek at the first five table names.
allTables[seq_len(5)]
[1] "HInv"         "HInvGeneMrna" "acembly"      "acemblyClass" "acemblyPep"  

Get dimensions of a specific table

# Column names of the affyU133Plus2 table.
dbListFields(hg19, "affyU133Plus2")
[1] "bin"         "matches"     "misMatches"  "repMatches"  "nCount"      "qNumInsert"  "qBaseInsert" "tNumInsert" 
 [9] "tBaseInsert" "strand"      "qName"       "qSize"       "qStart"      "qEnd"        "tName"       "tSize"      
[17] "tStart"      "tEnd"        "blockCount"  "blockSizes"  "qStarts"     "tStarts"    

# Row count of the same table, computed on the server.
dbGetQuery(hg19, "Select count(*) from affyU133Plus2")
  count(*)
1    58463

Read from the table

# Pull the whole affyU133Plus2 table into R.
affyData <- dbReadTable(hg19, "affyU133Plus2")

# Show the first rows to check what was loaded.
head(affyData)
 bin matches misMatches repMatches nCount qNumInsert qBaseInsert tNumInsert tBaseInsert strand        qName qSize qStart
1 585     530          4          0     23          3          41          3         898      -  225995_x_at   637      5
2 585    3355         17          0    109          9          67          9       11621      -  225035_x_at  3635      0
3 585    4156         14          0     83         16          18          2          93      -  226340_x_at  4318      3
4 585    4667          9          0     68         21          42          3        5743      - 1557034_s_at  4834     48
5 585    5180         14          0    167         10          38          1          29      -    231811_at  5399      0
6 585     468          5          0     14          0           0          0           0      -    236841_at   487      0

Select a specific subset

2.Reading from HDF5

HDF官网

Create group

# rhdf5 is installed through BiocManager (the package name is case-sensitive).
BiocManager::install("rhdf5")
library(rhdf5)

# Create the HDF5 file, then a group "foo" and a nested subgroup "foo/foobaa".
# Each call's return value is kept in `created`.
created <- h5createFile("example.h5")
created <- h5createGroup("example.h5", "foo")
created <- h5createGroup("example.h5", "foo/foobaa")

# List the file's contents to confirm the groups exist.
h5ls("example.h5")

Write to groups

# A 5 x 2 integer matrix, written as dataset "A" inside group "foo".
A <- matrix(1:10, nrow = 5, ncol = 2)
h5write(A, "example.h5", "foo/A")

# A 5 x 2 x 2 array carrying a "scale" attribute, written into the subgroup.
B <- array(seq(0.1, 2.0, by = 0.1), dim = c(5, 2, 2))
attr(B, "scale") <- "liter"
h5write(B, "example.h5", "foo/foobaa/B")

# Re-list the file to see the new datasets.
h5ls("example.h5")

Write a data set

# Build a small three-column data frame. The column expressions are kept
# inline on purpose: data.frame() derives the column names from them.
df <- data.frame(
  1L:5L,
  seq(0, 1, length.out = 5),
  c("ab", "cde", "fghi", "a", "s"),
  stringsAsFactors = FALSE
)

# Write it at the top level of the file under the name "df".
h5write(df, "example.h5", "df")
h5ls("example.h5")

Reading data

# Read each stored object back by its path inside the file.
readA <- h5read("example.h5", "foo/A")
readB <- h5read("example.h5", "foo/foobaa/B")
readdf <- h5read("example.h5", "df")

# Print the matrix that was read back.
readA

Writing and reading chunks

# Write 12, 13, 14 into part of foo/A only: `index` selects rows 1 to 3 of
# column 1, so the rest of the dataset is left as it was.
h5write(
  c(12, 13, 14),
  "example.h5",
  "foo/A",
  index = list(1:3, 1)
)

# Read the dataset back to see the modified entries.
h5read("example.h5", "foo/A")

3.Reading from The web

Getting data off webpages - readLines()

Parsing with XML

library(XML)

# Google Scholar citations page to scrape.
# NOTE(review): the user ID was restored to its upper-case form (with a zero,
# not the letter o) because the scraped text looked case-folded -- confirm
# against the course slides.
url <- "http://scholar.google.com/citations?user=HI-I6C0AAAAJ&hl=en"

# Parse with internal nodes so that XPath queries can be run on the result.
html <- htmlTreeParse(url, useInternalNodes = TRUE)

# Extract the page title, then the text of every cell with id "col-citedby".
xpathSApply(html, "//title", xmlValue)
xpathSApply(html, "//td[@id='col-citedby']", xmlValue)

GET from the httr package

library(httr)

# Fetch the page with httr, take the response body as text, and hand that
# text to the XML parser.
html2 <- GET(url)
content2 <- content(html2, as = "text")
parsedHtml <- htmlParse(content2, asText = TRUE)

# Same title query as before, now run on the httr-fetched document.
xpathSApply(parsedHtml, "//title", xmlValue)

Accessing websites with passwords

# Request a page protected by HTTP basic auth. The credentials are passed as
# strings; they match the user/passwd pair encoded in the httpbin test URL.
pgl <- GET(
  "http://httpbin.org/basic-auth/user/passwd",
  authenticate("user", "passwd")
)

# Print the response to inspect it.
pgl

4.Reading from APIs


5.Reading from Other Sources

上一篇 下一篇

猜你喜欢

热点阅读