5 min read

How to download sumo (or any other) data from data.world

data.world is an online catalogue for data and analysis.

You’ll need to install the data.world package:

install.packages("data.world")

Create a data.world account, find your API token under the advanced settings, and save it as an environment variable:

# Replace "..." with your personal data.world API token. Sys.setenv() only
# affects the current R session, so keep the real token out of shared scripts.
Sys.setenv(DW_API_TOKEN = "...")

You’re all set:

# Attach the data.world client, then configure it to pick up the API token
# from the DW_API_TOKEN environment variable set above.
library(data.world)
set_config(cfg_env(auth_token_var = "DW_API_TOKEN"))

If you’re not familiar with the tidyverse, it’s never too late to start. I’m a big fan of its pipes.

library(tidyverse)

This code downloads all files (about 35 MBytes at the time of writing) from my sumo dataset to the working directory:

# Look up the dataset's metadata, collect the name of every file it lists,
# then fetch each one into the working directory under that same name.
lapply(
    map(get_dataset("cervus/sumo-japan")$files, "name"),
    function(file_name) {
        download_file(
            dataset = "cervus/sumo-japan",
            file_name = file_name,
            output = file_name
        )
    }
)

You’ve got three CSV files:

# List the downloaded CSV files in the working directory. The trailing "$"
# anchors the match so that only names ending in ".csv" are returned
# (an unanchored pattern would also match e.g. "results.csv.bak").
list.files(pattern = "\\.csv$")
## [1] "banzuke.csv" "odds.csv"    "results.csv"

Banzuke – rankings published before each tournament – taken from Sumo Reference:

# Load the banzuke (rankings) CSV from the working directory and print a
# compact summary of its columns.
read_csv("banzuke.csv") %>%
    str()
## Parsed with column specification:
## cols(
##   basho = col_double(),
##   id = col_double(),
##   rank = col_character(),
##   rikishi = col_character(),
##   heya = col_character(),
##   shusshin = col_character(),
##   birth_date = col_date(format = ""),
##   height = col_double(),
##   weight = col_double(),
##   prev = col_character(),
##   prev_w = col_double(),
##   prev_l = col_double()
## )
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 163082 obs. of  12 variables:
##  $ basho     : num  1983 1983 1983 1983 1983 ...
##  $ id        : num  1354 4080 4095 4104 4112 ...
##  $ rank      : chr  "Y1e" "Y1w" "Y2eHD" "O1e" ...
##  $ rikishi   : chr  "Chiyonofuji" "Kitanoumi" "Wakanohana" "Takanosato" ...
##  $ heya      : chr  "Kokonoe" "Mihogaseki" "Futagoyama" "Futagoyama" ...
##  $ shusshin  : chr  "Hokkaido" "Hokkaido" "Aomori" "Aomori" ...
##  $ birth_date: Date, format: "1955-06-01" "1953-05-16" ...
##  $ height    : num  182 179 186 181 183 ...
##  $ weight    : num  116 165 133 144 163 121 138 181 124 156 ...
##  $ prev      : chr  "Y1e" "Y2eHD" "Y1w" "O1e" ...
##  $ prev_w    : num  14 9 0 10 10 12 8 9 9 11 ...
##  $ prev_l    : num  1 3 0 5 5 3 7 6 6 4 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   basho = col_double(),
##   ..   id = col_double(),
##   ..   rank = col_character(),
##   ..   rikishi = col_character(),
##   ..   heya = col_character(),
##   ..   shusshin = col_character(),
##   ..   birth_date = col_date(format = ""),
##   ..   height = col_double(),
##   ..   weight = col_double(),
##   ..   prev = col_character(),
##   ..   prev_w = col_double(),
##   ..   prev_l = col_double()
##   .. )

Results (top two divisions, at the moment) – also from Sumo Reference:

# Load the bout results CSV from the working directory and print a compact
# summary of its columns.
read_csv("results.csv") %>%
    str()
## Parsed with column specification:
## cols(
##   basho = col_double(),
##   day = col_double(),
##   rikishi1_id = col_double(),
##   rikishi1_rank = col_character(),
##   rikishi1_shikona = col_character(),
##   rikishi1_result = col_character(),
##   rikishi1_win = col_double(),
##   kimarite = col_character(),
##   rikishi2_id = col_double(),
##   rikishi2_rank = col_character(),
##   rikishi2_shikona = col_character(),
##   rikishi2_result = col_character(),
##   rikishi2_win = col_double()
## )
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 214788 obs. of  13 variables:
##  $ basho           : num  1983 1983 1983 1983 1983 ...
##  $ day             : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ rikishi1_id     : num  4140 4306 1337 4323 4097 ...
##  $ rikishi1_rank   : chr  "J13w" "Ms1e" "J12w" "J13e" ...
##  $ rikishi1_shikona: chr  "Chikubayama" "Ofuji" "Tochitsukasa" "Shiraiwa" ...
##  $ rikishi1_result : chr  "0-1 (7-8)" "1-0 (6-1)" "1-0 (9-6)" "0-1 (3-12)" ...
##  $ rikishi1_win    : num  0 1 1 0 0 1 0 1 0 1 ...
##  $ kimarite        : chr  "yorikiri" "yorikiri" "oshidashi" "oshidashi" ...
##  $ rikishi2_id     : num  4306 4140 4323 1337 4319 ...
##  $ rikishi2_rank   : chr  "Ms1e" "J13w" "J13e" "J12w" ...
##  $ rikishi2_shikona: chr  "Ofuji" "Chikubayama" "Shiraiwa" "Tochitsukasa" ...
##  $ rikishi2_result : chr  "1-0 (6-1)" "0-1 (7-8)" "0-1 (3-12)" "1-0 (9-6)" ...
##  $ rikishi2_win    : num  1 0 0 1 1 0 1 0 1 0 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   basho = col_double(),
##   ..   day = col_double(),
##   ..   rikishi1_id = col_double(),
##   ..   rikishi1_rank = col_character(),
##   ..   rikishi1_shikona = col_character(),
##   ..   rikishi1_result = col_character(),
##   ..   rikishi1_win = col_double(),
##   ..   kimarite = col_character(),
##   ..   rikishi2_id = col_double(),
##   ..   rikishi2_rank = col_character(),
##   ..   rikishi2_shikona = col_character(),
##   ..   rikishi2_result = col_character(),
##   ..   rikishi2_win = col_double()
##   .. )

Betting odds that I’ve been scraping from marathonbet.com since the May 2017 tournament:

# Load the betting odds CSV from the working directory and print a compact
# summary of its columns.
read_csv("odds.csv") %>%
    str()
## Parsed with column specification:
## cols(
##   rikishi1 = col_character(),
##   odds1 = col_double(),
##   rikishi2 = col_character(),
##   odds2 = col_double(),
##   ts = col_datetime(format = "")
## )
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 13836 obs. of  5 variables:
##  $ rikishi1: chr  "Arawashi" "Daishomaru" "Goeido" "Hakuho" ...
##  $ odds1   : num  1.8 1.9 1.3 1.1 1.4 1.7 1.6 1.53 1.35 1.26 ...
##  $ rikishi2: chr  "Ura" "Onosho" "Okinoumi" "Chiyonokuni" ...
##  $ odds2   : num  2.01 1.9 3.52 7 2.96 2.15 2.34 2.51 3.2 3.86 ...
##  $ ts      : POSIXct, format: "2017-05-13 09:00:01" "2017-05-13 09:00:01" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   rikishi1 = col_character(),
##   ..   odds1 = col_double(),
##   ..   rikishi2 = col_character(),
##   ..   odds2 = col_double(),
##   ..   ts = col_datetime(format = "")
##   .. )

Examples of what can be done with these data will follow.