library(WikidataR)
library(rtoot)
library(rtoot)
# Interactive rtoot authentication: prompts for the instance name and the
# token type (public or user), as shown in the console transcript below.
auth_setup() # authenticate the user
"mastodon.social"
What type of token do you want?
1: public
2: user
2
Token of type "user" for instance mastodon.social is valid
<mastodon bearer token> for instance: mastodon.social of type: user
# documentation on Rtoot is available here https://www.rdocumentation.org/packages/rtoot/versions/0.3.4
# Look up the "plazi_species" account and show what the search returned.
id <- search_accounts(query = "plazi_species")
print(id)

# Download up to 400 statuses for the account with this numeric id, then
# peek at the raw content of the first one.
df <- get_account_statuses(id = "109284766172512524", limit = 400L)
head(df[["content"]], n = 1)
library(stringr)
library(dplyr)
Attachement du package : ‘dplyr’
Les objets suivants sont masqués depuis ‘package:stats’:
filter, lag
Les objets suivants sont masqués depuis ‘package:base’:
intersect, setdiff, setequal, union
# The species name is taken as the first paragraph that sits between two
# other paragraphs in the status HTML, i.e. the text between "</p><p>" markers.
pattern <- "(?<=</p><p>)(.*?)(?=</p><p>)"
df <- mutate(df, species_name = str_extract(content, pattern))
head(df, n = 10)
items problématiques : https://mastodon.green/@plazi_species/113917317111853431 https://mastodon.green/@plazi_species/113916252425285117
library(stringr)
library(dplyr)
# Keep only rows whose extracted species name does not contain "Treatment:"
# (rows with a missing species name are dropped as well, since the test is NA).
df <- filter(df, str_detect(species_name, "Treatment:", negate = TRUE))
head(df, n = 10)
NA
library(purrr)
library(rvest)
# Replace each status' HTML content by its plain text. If parsing fails for
# an entry, that entry is kept unchanged.
df$content <- vapply(
  df$content,
  function(x) {
    tryCatch(
      html_text(read_html(x)),
      # fall back to the original message when an error is raised
      error = function(e) x
    )
  },
  character(1),
  USE.NAMES = FALSE
)
head(df$content, n = 10)
[1] "#NewSpecies!New running crab spiders from #china just ran in:Sinodromus fujianensisTreatment: treatment.plazi.org/id/41059653-9C0C-5EFD-91D5-09E2C260578BPublication: doi.org/10.3897/zookeys.1221.137930#ZooKeys_Journal #SinodromusFujianensis#FAIRdata#science #taxonomy #ecology #biodiversity #nature #wildlife #conservation #animals #invertebrates #arachnology #arachnids #spider"
[2] "#NewSpecies!New marine goby from #taiwan:Callogobius aquilusTreatment: treatment.plazi.org/id/03BF244D-9543-FFB8-33D7-2193FE85F892Publication: doi.org/10.11646/zootaxa.5550.1.16#Zootaxa #CallogobiusAquilus#FAIRdata#science #OA #openaccess #biology #taxonomy #ecology #biodiversity #nature #wildlife #conservation #animals #fish #TeamFish #ichthyology #goby"
[3] "#NewSpecies!New beetle from #korea just came in:Viettherchnus orszulikiTreatment: treatment.plazi.org/id/1523AC3A-F82E-2968-BDE9-86E7FAF0F83BPublication: doi.org/10.11646/zootaxa.5519.4.7#Zootaxa #ViettherchnusOrszuliki#FAIRdata#science #OA #openaccess #biology #taxonomy #ecology #biodiversity #nature #wildlife #conservation #animals #invertebrates #entomology #insects #coleoptera #beetles"
[4] "#NewSpecies!New freshwater crayfish from #indonesia just surfaced:Cherax pulverulentusTreatment: treatment.plazi.org/id/03DB8794-3B48-037D-419A-44C726243339Publication: doi.org/10.11646/zootaxa.5566.3.4#Zootaxa #CheraxPulverulentus#FAIRdata#science #OA #openaccess #biology #taxonomy #ecology #biodiversity #nature #wildlife #conservation #animals #invertebrates #arthropods #crustacea #crayfish #freshwater"
[5] "#NewSpecies!New harlequin frog from #peru just jumped in:Atelopus histrionicusTreatment: treatment.plazi.org/id/03E48797-FF95-FF8E-4081-7BA8FD38F9A7Publication: doi.org/10.11646/zootaxa.5571.1.1#Zootaxa #AtelopusHistrionicus#FAIRdata#science #OA #openaccess #biology #taxonomy #ecology #biodiversity #nature #wildlife #conservation #animals #herpetology #herps #amphibia #frogs #harlequin"
[6] "#NewSpecies!New planthopper from #vietnam:Pseudochoutagus lindaeTreatment: treatment.plazi.org/id/CB3887FB-7173-620A-FB48-F995FC1CFAC4Publication: doi.org/10.5852/ejt.2024.975.2769#ejtaxonomy #PseudochoutagusLindae#FAIRdata#science #OA #openaccess #biology #taxonomy #ecology #biodiversity #nature #wildlife #conservation #animals #invertebrates #entomology #insects #hemiptera #planthopper"
[7] "#NewSpecies!New snake from #ethiopia just sidled in:Boaedon broadleyiTreatment: treatment.plazi.org/id/03D887FD-FFDC-6D7D-FF18-0F58FE64CBCCPublication: doi.org/10.11646/zootaxa.5569.1.4 #Zootaxa #BoaedonBroadleyi#FAIRdata#science #OA #openaccess #biology #taxonomy #ecology #biodiversity #nature #wildlife #conservation #animals #herpetology #herps #reptilia #snake"
[8] "#NewSpecies!New rove beetle from #china just roved by:Hesperus pengiTreatment: treatment.plazi.org/id/612C87D2-185E-FFE0-EEFC-27C3FEFAAB1BPublication: doi.org/10.11646/zootaxa.5474.5.6#Zootaxa #HesperusPengi#FAIRdata#science #OA #openaccess #biology #taxonomy #ecology #biodiversity #nature #wildlife #conservation #animals #invertebrates #entomology #insects #coleoptera #beetles #rovebeetles"
[9] "#NewSpecies!New snakehead from #myanmar just surfaced:Channa pyrophthalmusTreatment: treatment.plazi.org/id/0386C114-1644-FFE5-FF67-FBFF3F97EFADPublication: doi.org/10.26107/RBZ-2024-0001#RafflesBulletinOfZoology #ChannaPyrophthalmus#FAIRdata#science #OA #openaccess #biology #taxonomy #ecology #biodiversity #nature #wildlife #conservation #animals #fish #TeamFish #fishofmastodon #fishfriday #ichthyology #snakehead"
[10] "#NewSpeciesA newly discovered fungus from #china for #FungiFriday:Trichoderma strophariensisTreatment: treatment.plazi.org/id/22D20AFB-BC41-5495-B40A-EB53EFFBE5CBPublication: doi.org/10.3897/mycokeys.110.134154#MycoKeys #TrichodermaStrophariensis#FAIRdata#science #OA #openaccess #biology #taxonomy #ecology #biodiversity #nature #wildlife #conservation #fungi #mushrooms #mycology"
library(stringr)
# Extract the DOI that follows "doi.org/" in each status text, up to the next
# hashtag. The dot is escaped so that it only matches a literal ".", and the
# match is trimmed because some posts put a space between the DOI and the
# following hashtag, which would otherwise leave trailing whitespace in the
# identifier.
pattern <- "(?<=doi\\.org/)(.*?)(?=#)"
df <- df %>%
  mutate(doi = str_trim(str_extract(content, pattern)))
head(df, 10)
library(stringr)
# The country is the hashtag word that directly follows "from #"; it is
# converted to title case.
pattern <- "(?<=from #)(\\w+)"
df <- mutate(
  df,
  country_of_discovery = str_to_title(str_extract(content, pattern))
)
head(df, n = 10)
# Keep only the three extracted columns.
df <- df %>%
  select(species_name, country_of_discovery, doi)
head(df, n = 10)
location of discovery: P189 ; described by source: P1343
library(WikidataR)
library(WikidataQueryServiceR)
See ?WDQS for resources on Wikidata Query Service and SPARQL
Attachement du package : ‘WikidataQueryServiceR’
Les objets suivants sont masqués depuis ‘package:WikidataR’:
get_example, query_wikidata
library(dplyr)
Attachement du package : ‘dplyr’
Les objets suivants sont masqués depuis ‘package:stats’:
filter, lag
Les objets suivants sont masqués depuis ‘package:base’:
intersect, setdiff, setequal, union
# Ask the Wikidata Query Service for every item that is an instance of (P31)
# Q6256, together with its English label. The result has two columns:
# `item` and `itemLabel`.
# NOTE(review): WikidataQueryServiceR is attached after WikidataR, so this
# call presumably resolves to WikidataQueryServiceR::query_wikidata - confirm.
countries_qid <- query_wikidata('
SELECT ?item ?itemLabel
WHERE
{
?item wdt:P31 wd:Q6256 .
SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
}')
Rows: 194 Columns: 2── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (2): item, itemLabel
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Strip the entity URL prefix so that only the bare QID remains.
# The prefix is matched as an anchored literal: the former pattern wrapped the
# URL in square brackets, which made it a character class that deleted any of
# those individual characters wherever they occurred.
qid_no_url <- sub("^http://www\\.wikidata\\.org/entity/", "", countries_qid$item)

# Convert to a plain data frame, then add the QID and a
# `country_of_discovery` column (copy of the label) used later as merge key.
countries_qid <- as.data.frame(countries_qid) %>%
  mutate(
    qid = qid_no_url,
    country_of_discovery = as.character(itemLabel)
  )
head(countries_qid, 10)
# Join the species table with the country QIDs on the country name and keep
# the columns needed downstream. merge() keeps only rows whose country name
# is found in both tables.
df1 <- df %>%
  merge(countries_qid, by = "country_of_discovery") %>%
  select(species_name, qid, country_of_discovery, doi)
head(df1, n = 10)
NA
#' Look up the Wikidata QID of the first search hit for a name
#'
#' @param name A single character string to search for in Wikidata.
#' @return The `id` of the first result returned by `find_item()`, or
#'   `NA_character_` when `name` is missing/blank, when nothing is found, or
#'   when the first result carries no `id`.
get_qid <- function(name) {
  # Do not query the API for missing or blank names.
  if (length(name) != 1L || is.na(name) || !nzchar(trimws(name))) {
    return(NA_character_)
  }

  results <- find_item(name) # search for the name in Wikidata
  if (length(results) == 0L) {
    return(NA_character_)
  }

  # NOTE(review): the first hit is taken as-is; it is not checked to be a
  # taxon with exactly this name.
  qid <- results[[1]]$id
  if (is.null(qid)) NA_character_ else qid
}
# Look up a QID for every species name. vapply() guarantees a character
# vector of the right length even when there are no rows or when every lookup
# returns NA (sapply() would give a list or a logical vector in those cases).
df1$QID_items <- vapply(
  df1$species_name,
  function(species) as.character(get_qid(species)),
  character(1)
)
head(df1, 10)
NA
NA
NA
# df2: species for which the QID lookup returned NA.
df2 <- df1[which(is.na(df1[["QID_items"]])), ]
head(df2, n = 10)
NA
# df3: items already created in Wikidata (a QID was found).
# Missing values are detected with is.na(); comparing against "NA" with
# filter() does not work, see
# https://stackoverflow.com/questions/7980622/subset-of-rows-containing-na-missing-values-in-a-chosen-column-of-a-data-frame#7980765
df3 <- df1[which(!is.na(df1[["QID_items"]])), ]
head(df3, n = 10)
NA
# Build the QuickStatements-style columns for the first three species that
# have no Wikidata item yet:
#   item - "CREATE_<n>" placeholder, one per new item
#   Len  - English label (the species name)
#   P189 - QID of the country of discovery
#   P356 - DOI
# as.character() is used instead of paste0() so that missing values stay NA
# rather than becoming the literal string "NA", which a later is.na() filter
# could not remove before upload.
df2_sub <- df2 %>%
  slice_head(n = 3) %>%
  mutate(
    row_num = row_number(),
    item = paste0("CREATE_", row_num),
    Len = as.character(species_name),
    P189 = as.character(qid),
    P356 = as.character(doi)
  ) %>%
  select(item, species_name, Len, P189, P356)
print(df2_sub)
NA
library(tidyr)
library(stringr)
# Reshape the wide table into one (item, property, value) row per statement.
import <- df2_sub %>%
  select(
    item,
    matches("^L", ignore.case = FALSE),
    matches("^D", ignore.case = FALSE),
    # add matches("^S", ignore.case = FALSE) here if there are sitelinks to
    # other wiki pages
    matches("^P", ignore.case = FALSE)
  ) %>%
  # every column except `item` becomes a property/value pair
  pivot_longer(cols = -item, names_to = "property", values_to = "value") %>%
  # helper columns that refer to the same property share the part before "_"
  mutate(property = str_remove(property, "_.*")) %>%
  drop_na(value) %>%
  distinct()
print(import)
library(WikidataR)
# Upload the prepared statements to Wikidata through the QuickStatements API.
# The token is read from the environment so that the credential never lives
# in the source file: set QUICKSTATEMENTS_TOKEN (e.g. in ~/.Renviron) to the
# token shown on https://quickstatements.toolforge.org/#/user before running.
# NOTE(review): any token that was previously committed in this file should
# be treated as leaked and regenerated.
api_token <- Sys.getenv("QUICKSTATEMENTS_TOKEN")
if (!nzchar(api_token)) {
  stop(
    "QUICKSTATEMENTS_TOKEN is not set; cannot upload to Wikidata.",
    call. = FALSE
  )
}
write_wikidata(
  items = import$item,
  properties = import$property,
  values = import$value,
  format = "api",
  api.username = "Udo_Bolano",
  api.token = api_token
)
api.token = mettre ici le token récupéré sur https://quickstatements.toolforge.org/#/user
voir mode d’emploi ici : https://katharinabrunner.de/2022/06/wikibase-wikidata-etl-data-import-with-r/