import pandas as pd
import rdflib
# e.g. https://www.schachbund.de/download-dwz-daten.html
# default encoding is utf-8, in case of errors try e.g. latin-1 and cp1252
# see also https://docs.python.org/3/library/codecs.html#standard-encodings
players = pd.read_csv('spieler.csv', encoding = "latin-1")
players.head()
| ID | VKZ | Mgl-Nr | Status | Spielername | Geschlecht | Spielberechtigung | Geburtsjahr | Letzte-Auswertung | DWZ | Index | FIDE-Elo | FIDE-Titel | FIDE-ID | FIDE-Land | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 10029745 | 10614 | 1081 | P | Carlsen,Magnus | M | E | 1990.0 | 202517.0 | 2841.0 | 105.0 | 2837.0 | GM | 1503014.0 | NOR |
| 1 | 10029745 | 40039 | 444 | A | Carlsen,Magnus | M | E | 1990.0 | 202517.0 | 2841.0 | 105.0 | 2837.0 | GM | 1503014.0 | NOR |
| 2 | 10598871 | 62236 | 1465 | A | Gukesh,Dommaraju | M | A | 2006.0 | 202505.0 | 2808.0 | 13.0 | 2776.0 | GM | 46616543.0 | IND |
| 3 | 10571147 | 10102 | 1185 | A | Abdusattorov,Nodirbek | M | A | 2004.0 | 202517.0 | 2796.0 | 19.0 | 2767.0 | GM | 14204118.0 | UZB |
| 4 | 10530520 | 23003 | 1120 | A | Erigaisi,Arjun | M | A | 2003.0 | 202517.0 | 2776.0 | 13.0 | 2782.0 | GM | 35009192.0 | IND |
# translate relevant column names to English; optional. VKZ = Vereinskennzahl = club ID
players.rename(columns={'ID': 'player_id','Spielername': 'player_name', 'VKZ': 'club_id'}, inplace=True)
players.head()
| player_id | club_id | Mgl-Nr | Status | player_name | Geschlecht | Spielberechtigung | Geburtsjahr | Letzte-Auswertung | DWZ | Index | FIDE-Elo | FIDE-Titel | FIDE-ID | FIDE-Land | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 10029745 | 10614 | 1081 | P | Carlsen,Magnus | M | E | 1990.0 | 202517.0 | 2841.0 | 105.0 | 2837.0 | GM | 1503014.0 | NOR |
| 1 | 10029745 | 40039 | 444 | A | Carlsen,Magnus | M | E | 1990.0 | 202517.0 | 2841.0 | 105.0 | 2837.0 | GM | 1503014.0 | NOR |
| 2 | 10598871 | 62236 | 1465 | A | Gukesh,Dommaraju | M | A | 2006.0 | 202505.0 | 2808.0 | 13.0 | 2776.0 | GM | 46616543.0 | IND |
| 3 | 10571147 | 10102 | 1185 | A | Abdusattorov,Nodirbek | M | A | 2004.0 | 202517.0 | 2796.0 | 19.0 | 2767.0 | GM | 14204118.0 | UZB |
| 4 | 10530520 | 23003 | 1120 | A | Erigaisi,Arjun | M | A | 2003.0 | 202517.0 | 2776.0 | 13.0 | 2782.0 | GM | 35009192.0 | IND |
clubs = pd.read_csv('vereine.csv', encoding='latin-1')
clubs.head()
| ZPS | LV | Verband | Vereinname | |
|---|---|---|---|---|
| 0 | 10102 | 1 | 101 | SC Viernheim 1934 e.V. |
| 1 | 10104 | 1 | 101 | VLK Lampertheim |
| 2 | 10106 | 1 | 101 | SK 1962 Ladenburg |
| 3 | 10109 | 1 | 101 | SSC Altlußheim |
| 4 | 10111 | 1 | 101 | SF 1946 Brühl |
clubs.rename(columns={'ZPS': 'club_id', 'Vereinname': 'club_name'}, inplace=True)
clubs.head()
| club_id | LV | Verband | club_name | |
|---|---|---|---|---|
| 0 | 10102 | 1 | 101 | SC Viernheim 1934 e.V. |
| 1 | 10104 | 1 | 101 | VLK Lampertheim |
| 2 | 10106 | 1 | 101 | SK 1962 Ladenburg |
| 3 | 10109 | 1 | 101 | SSC Altlußheim |
| 4 | 10111 | 1 | 101 | SF 1946 Brühl |
# dataframes can be JOINed with the merge() function
df = players.merge(clubs, on='club_id')
df.head()
| player_id | club_id | Mgl-Nr | Status | player_name | Geschlecht | Spielberechtigung | Geburtsjahr | Letzte-Auswertung | DWZ | Index | FIDE-Elo | FIDE-Titel | FIDE-ID | FIDE-Land | LV | Verband | club_name | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 10029745 | 10614 | 1081 | P | Carlsen,Magnus | M | E | 1990.0 | 202517.0 | 2841.0 | 105.0 | 2837.0 | GM | 1503014.0 | NOR | 1 | 106 | OSG Baden-Baden |
| 1 | 10571151 | 10614 | 1085 | A | Firouzja,Alireza | M | - | 2003.0 | 202517.0 | 2773.0 | 23.0 | 2766.0 | GM | 12573981.0 | FRA | 1 | 106 | OSG Baden-Baden |
| 2 | 10004479 | 10614 | 525 | A | Aronian,Levon | M | D | 1982.0 | 202517.0 | 2756.0 | 133.0 | 2742.0 | GM | 13300474.0 | USA | 1 | 106 | OSG Baden-Baden |
| 3 | 10266090 | 10614 | 658 | A | Caruana,Fabiano | M | D | 1992.0 | 202505.0 | 2743.0 | 146.0 | 2777.0 | GM | 2020009.0 | USA | 1 | 106 | OSG Baden-Baden |
| 4 | 10171051 | 10614 | 782 | A | Rapport,Richard | M | E | 1996.0 | 202517.0 | 2738.0 | 125.0 | 2714.0 | GM | 738590.0 | HUN | 1 | 106 | OSG Baden-Baden |
# iterating over df returns columns, not rows
# need to call iterrows() on the dataframe
for x in df.iterrows():
print(x)
# tuple of 2 elements
print(len(x))
# index, series
print(type(x[0]), type(x[1]))
break
(0, player_id 10029745 club_id 10614 Mgl-Nr 1081 Status P player_name Carlsen,Magnus Geschlecht M Spielberechtigung E Geburtsjahr 1990.0 Letzte-Auswertung 202517.0 DWZ 2841.0 Index 105.0 FIDE-Elo 2837.0 FIDE-Titel GM FIDE-ID 1503014.0 FIDE-Land NOR LV 1 Verband 106 club_name OSG Baden-Baden Name: 0, dtype: object) 2 <class 'int'> <class 'pandas.core.series.Series'>
Ready to generate N3 statements:
clubs = {}
pfx = """
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix foaf: <http://xmlns.com/foaf/0.1/> .
@base <http://my.org/> .
@prefix : <#> .
"""
data = pfx
from time import time
t0 = time()
g = rdflib.Graph()
for i, x in df.iterrows():
# player IDs and some names are numbers, convert to strings
# labeled series: we can use x.ID or x['ID']
player_id = str(x.player_id)
player_name = str(x.player_name)
club_id = x.club_id
# some club names contain double quotes
club_name = x.club_name.replace('"','')
data += ':p' + player_id + ' foaf:name "' + player_name + '" .'
data += ':p' + player_id + ' :inclub :c' + club_id + ' .'
clubs[club_id] = club_name
for k in clubs:
data += ':c' + k + ' foaf:name "' + clubs[k] + '" .'
g.parse(data=data, format='turtle')
print(time() - t0, 'seconds', len(g), 'triples')
52.49227571487427 seconds 192343 triples
Check some statements (not a reliable verification, but better than nothing):
q = """
SELECT ?player ?club
WHERE {
?pl :inclub ?cl .
?pl foaf:name ?player .
?cl foaf:name ?club .
} LIMIT 10
"""
for p, c in g.query(q):
print(p, c)
Ghafouri,Said Basel Schachfreunde Lünen 1993 e. V. Wördemann,Stefan Schachclub Caissa Bad Salzuflen e. V. Schmölz,Paul SK Marburg 1931/72 Melder,Hermann SK 1980 Gernsheim Izzo,Josua SSC Altlußheim Kitzhöfer,Josef Schachfreunde Josefsheim Bigge Gross,Francis Schachfreunde Tegernheim Neumann,Florian SK Lauffen Schubert,Jens TSG GutsMuths 1860 Quedlinburg Povilaitis,Vadimas Schachgemeinschaft Ahaus-Wessum 1998 e. V
Finally, the clubs and their member counts.
Note the similarity to SQL.
q = """
SELECT ?club ?clubname (COUNT(DISTINCT ?player) AS ?cnt)
WHERE {
?player :inclub ?club .
?club foaf:name ?clubname .
}
GROUP BY ?club
ORDER BY DESC(?cnt)
LIMIT 10
"""
for club, name, cnt in g.query(q):
print(club, name, cnt)
http://my.org/#cG0353 Schachzwerge Magdeburg 884 http://my.org/#c40023 Hamburger SK von 1830 eV 765 http://my.org/#c10614 OSG Baden-Baden 441 http://my.org/#c22001 FC Bayern München e.V. 381 http://my.org/#cA0221 Elmshorner SC von 1896 366 http://my.org/#c22059 SC Garching 1980 e.V. 287 http://my.org/#c30002 SC Kreuzberg e.V. 268 http://my.org/#c61325 Schachverein Mülheim-Nord 1931 e. V. 253 http://my.org/#c10422 Karlsruher SF 1853 246 http://my.org/#cA080I Lübecker SV von 1873 235