Introduction

I am going through the CS109 Harvard lectures on data science. I just watched a couple of lectures on web-scraping with BeautifulSoup, so I wanted to practice. I decided to scrape squash ranking data from wikipedia, as I am a avid fan of the sport. On wikipedia, the best information I could find was the top 10 players at the end of each year for the past 25 years or so.

Results

The results of the scraping process are in the following tables, summarising some key stats for the players. The tables are ordered by players’ average rank in the dataset. It is worth emphasising that the data only contains information on Top 10 rankings and only at the end of each year; this will skew the data in various ways.

Summary for female players

|                  player | average_rank | years_in_top10 | best_rank | worst_rank | earliest_year | latest_year |
|------------------------:|-------------:|---------------:|----------:|-----------:|--------------:|------------:|
|         Michelle Martin |     1.400000 |              5 |         1 |          2 |          1994 |        1998 |
|       Sarah Fitz-Gerald |     2.333333 |              9 |         1 |          5 |          1994 |        2002 |
|             Nicol David |     2.357143 |             14 |         1 |          6 |          2004 |        2017 |
|       Raneem El Weleily |     2.444444 |              9 |         1 |          7 |          2011 |        2019 |
|          Leilani Rorani |     2.750000 |              4 |         1 |          7 |          1998 |        2001 |
|        Nour El Sherbini |     3.142857 |              7 |         1 |          6 |          2012 |        2019 |
|         Rachael Grinham |     3.909091 |             11 |         1 |          8 |          2001 |        2011 |
|             Carol Owens |     4.100000 |             10 |         1 |          8 |          1994 |        2003 |
|           Laura Massaro |     4.111111 |              9 |         2 |          9 |          2010 |        2018 |
|          Cassie Jackman |     4.545455 |             11 |         2 |          8 |          1994 |        2004 |
|         Natalie Grinham |     4.666667 |              9 |         2 |          9 |          2003 |        2013 |
|        Natalie Grainger |     5.100000 |             10 |         3 |          7 |          1999 |        2009 |
|              Sue Wright |     5.250000 |              4 |         4 |          8 |          1994 |        1998 |
|           Jenny Duncalf |     5.375000 |              8 |         2 |          9 |          2005 |        2013 |
|            Nouran Gohar |     5.400000 |              5 |         3 |          9 |          2015 |        2019 |
|           Linda Elriani |     5.555556 |              9 |         3 |          9 |          1997 |        2005 |
|          Suzanne Horner |     5.625000 |              8 |         2 |          9 |          1994 |        2001 |
|            Tania Bailey |     5.666667 |              6 |         5 |          9 |          1999 |        2007 |
|        Vanessa Atkinson |     5.875000 |              8 |         1 |         10 |          2002 |        2010 |
|           Camille Serme |     5.888889 |              9 |         3 |         10 |          2010 |        2019 |
|           Nour El Tayeb |     6.000000 |              5 |         3 |          8 |          2014 |        2019 |
|             Kasey Brown |     6.000000 |              2 |         5 |          7 |          2010 |        2011 |
|              Liz Irving |     6.200000 |              5 |         3 |         10 |          1994 |        1998 |
|           Alison Waters |     6.400000 |             10 |         3 |         10 |          2008 |        2018 |
|             Joelle King |     6.666667 |              6 |         4 |         10 |          2012 |        2019 |
|         Vicky Botwright |     7.000000 |              4 |         5 |          9 |          2003 |        2007 |
|            Amanda Sobhy |     7.000000 |              2 |         7 |          7 |          2016 |        2019 |
|        Sarah-Jane Perry |     7.000000 |              3 |         6 |          8 |          2017 |        2019 |
|            Low Wee Wern |     7.333333 |              3 |         7 |          8 |          2012 |        2014 |
|          Madeline Perry |     7.428571 |              7 |         4 |         10 |          2006 |        2013 |
|          Sabine Schoene |     7.500000 |              4 |         6 |          9 |          1995 |        1998 |
|       Omneya Abdel Kawy |     7.900000 |             10 |         4 |         10 |          2004 |        2016 |
|            Fiona Geaves |     8.444444 |              9 |         5 |         10 |          1994 |        2004 |
| Laura Lengthorn-Massaro |     8.500000 |              2 |         8 |          9 |          2008 |        2009 |
|                Annie Au |     8.750000 |              4 |         8 |         10 |          2011 |        2015 |
|             Tesni Evans |     9.000000 |              2 |         9 |          9 |          2018 |        2019 |
|          Rebecca Macree |     9.250000 |              4 |         8 |         10 |          2001 |        2004 |
|         Stephanie Brind |     9.250000 |              4 |         7 |         10 |          1999 |        2003 |
|            Claire Nitch |     9.333333 |              3 |         9 |         10 |          1994 |        1996 |
|             Jane Martin |     9.500000 |              2 |         9 |         10 |          1994 |        1995 |
|        Hania El Hammamy |    10.000000 |              1 |        10 |         10 |          2019 |        2019 |
|         Shelley Kitchen |    10.000000 |              2 |        10 |         10 |          2007 |        2008 |
|         Dipika Pallikal |    10.000000 |              1 |        10 |         10 |          2012 |        2012 |

Summary for male players

|                 player | average_rank | years_in_top10 | best_rank | worst_rank | earliest_year | latest_year |
|-----------------------:|-------------:|---------------:|----------:|-----------:|--------------:|------------:|
|            Peter Nicol |     2.400000 |             10 |         1 |          8 |          1996 |        2005 |
|              Ali Farag |     3.250000 |              4 |         1 |          7 |          2016 |        2019 |
|           Jansher Khan |     3.333333 |              3 |         1 |          8 |          1996 |        1998 |
|         Jonathon Power |     3.666667 |              9 |         1 |          9 |          1997 |        2005 |
|    Mohamed El Shorbagy |     3.700000 |             10 |         1 |         10 |          2010 |        2019 |
|            Ramy Ashour |     3.909091 |             11 |         1 |          7 |          2006 |        2016 |
|       Grégory Gaultier |     4.000000 |             15 |         1 |         10 |          2003 |        2018 |
|           Ahmed Barada |     4.250000 |              4 |         2 |          7 |          1997 |        2000 |
|           Rodney Eyles |     4.333333 |              3 |         2 |          7 |          1996 |        1998 |
|           Nick Matthew |     4.642857 |             14 |         1 |         10 |          2004 |        2017 |
|           David Palmer |     4.727273 |             11 |         1 |          9 |          2000 |        2011 |
|            Amr Shabana |     4.727273 |             11 |         1 |          9 |          2004 |        2014 |
|           Paul Johnson |     5.000000 |              3 |         4 |          7 |          1998 |        2000 |
|        Stewart Boswell |     5.000000 |              2 |         4 |          6 |          2001 |        2002 |
|         Thierry Lincou |     5.200000 |             10 |         1 |          9 |          2001 |        2010 |
|        James Willstrop |     5.272727 |             11 |         1 |         10 |          2005 |        2017 |
|       Anthony Ricketts |     5.500000 |              4 |         3 |          7 |          2002 |        2006 |
|          Karim Darwish |     5.600000 |             10 |         1 |          9 |          2003 |        2013 |
|      Karim Abdel Gawad |     5.666667 |              6 |         2 |          9 |          2015 |        2019 |
|           Martin Heath |     5.666667 |              3 |         5 |          6 |          1998 |        2000 |
|             Del Harris |     6.000000 |              2 |         6 |          6 |          1996 |        1997 |
|             Dan Jenson |     6.000000 |              1 |         6 |          6 |          1998 |        1998 |
|     Marwan El Shorbagy |     6.250000 |              4 |         5 |          9 |          2016 |        2019 |
|             John White |     6.571429 |              7 |         2 |         10 |          1999 |        2007 |
|              Paul Coll |     6.666667 |              3 |         5 |          8 |          2017 |        2019 |
|            Omar Mosaad |     6.666667 |              3 |         4 |          8 |          2012 |        2016 |
|            Tarek Momen |     6.800000 |              5 |         4 |         10 |          2014 |        2019 |
|           Lee Beachill |     6.800000 |              5 |         1 |         10 |          2002 |        2006 |
| Miguel Ángel Rodríguez |     7.000000 |              3 |         5 |         10 |          2015 |        2019 |
|            Diego Elias |     7.000000 |              1 |         7 |          7 |          2019 |        2019 |
|       Stefan Casteleyn |     7.000000 |              1 |         7 |          7 |          1999 |        1999 |
|            David Evans |     7.000000 |              2 |         4 |         10 |          2000 |        2001 |
|            Simon Parke |     7.000000 |              5 |         3 |         10 |          1996 |        2000 |
|          Craig Rowland |     7.000000 |              1 |         7 |          7 |          1996 |        1996 |
|           Chris Walker |     7.000000 |              2 |         4 |         10 |          1996 |        1997 |
|           Brett Martin |     7.000000 |              2 |         5 |          9 |          1996 |        1997 |
|            Borja Golán |     7.000000 |              2 |         7 |          7 |          2013 |        2014 |
|           Peter Barker |     7.142857 |              7 |         5 |          9 |          2008 |        2014 |
|           Anthony Hill |     7.333333 |              3 |         5 |          9 |          1996 |        1999 |
|           Simon Rösner |     7.500000 |              6 |         3 |         10 |          2014 |        2019 |
|           Ong Beng Hee |     7.666667 |              3 |         7 |          8 |          2001 |        2003 |
|          Mark Chaloner |     8.666667 |              3 |         8 |         10 |          1996 |        2002 |
|     Laurens Jan Anjema |     9.000000 |              1 |         9 |          9 |          2010 |        2010 |
|             Alex Gough |     9.000000 |              2 |         9 |          9 |          1999 |        2000 |
|          Wael El Hindi |     9.000000 |              3 |         8 |         10 |          2007 |        2009 |
|      Mathieu Castagnet |     9.000000 |              2 |         9 |          9 |          2015 |        2016 |
|             Paul Price |     9.500000 |              2 |         9 |         10 |          2000 |        2001 |
|             Derek Ryan |    10.000000 |              1 |        10 |         10 |          1998 |        1998 |
|    Mohd Azlan Iskandar |    10.000000 |              1 |        10 |         10 |          2011 |        2011 |
|     Mohamed Abouelghar |    10.000000 |              1 |        10 |         10 |          2018 |        2018 |
|            Daryl Selby |    10.000000 |              2 |        10 |         10 |          2012 |        2013 |

Patterns and observations

It is satisfying to be able to compare the various big names in squash.

Like I said earlier, the data is skewed in various ways.

Next steps

I will see if I can find more detailed ranking information, so I can get a more fine-grained analysis of the players. A project I have in the back of my mind is to create my own ranking system based on match history, and see if I can create a system which is more predictive than the current system. I think this should be possible, but again, I will need to see if I can obtain the relevant data.

The code

import requests
from bs4 import BeautifulSoup
from IPython.core.display import HTML
import pandas as pd
import numpy as np

urls = ['https://en.wikipedia.org/wiki/Official_Women%27s_Squash_World_Ranking',
    'https://en.wikipedia.org/wiki/Official_Men%27s_Squash_World_Ranking']

def is_not_numeric(s):
    try:
        float(s)
    except ValueError:
        return True
    else:
        return False

def table_to_pandas(table):
    rows = table.find_all('tr')
    headers = [col.text.replace('\n', '') for col in rows[0].find_all('th')[1:]]
    
    n_cols = len(rows[-1].find_all('td'))    
    
    data = [
            [col.text.replace('\n','')
             for col in row.find_all('td')[1:]
             if is_not_numeric(col.text.replace('\n',''))
            ]
            for row in rows[1:]
           ]
    
    return pd.DataFrame(data, columns = headers, index = range(1,11))

def url_to_pandas(url):
    """
    headers contains id needed to cut html into two pieces
    """
    html = requests.get(url).text
    start = html.find('id="Year_end_world_top_10_players')
    end = html.find('id="Year-end_number_1')
    tables = BeautifulSoup(html[start:end], 'html.parser').find_all('table')

    df = pd.concat([table_to_pandas(t) for t in tables], axis=1)
    df_stack = df.stack().reset_index()
    df_stack.columns = ['rank', 'year', 'player']
    
    return df_stack

def player_summaries(df):
#     years_in_top10 = df.player.value_counts()
    players = ( df.groupby('player')
               .agg({'rank': [np.mean, 'count', np.min, np.max],
                     'year': [np.min, np.max]
                    })
              )

    players.columns = ['average_rank', 'years_in_top10','best_rank', 'worst_rank',
                       'earliest_year','latest_year']
    players.sort_values(by = ['average_rank'], inplace = True)
    
    return players

df_f, df_m = [url_to_pandas(urls[i]) for i in range(2)]
players_f = player_summaries(df_f)
players_m = player_summaries(df_m)