Yesterday I spent a good chunk of time actually coding and getting things to work.
Basically, I wrote a script that scrapes all the season data for each team, as well as the Per Game data for every player. Right now it just prints the info, but the next step is to use sqlite3 to insert it into a DB.
For anyone who cares, here's the script that grabs the info for Player Per Game. I still have to figure out how I'm going to do it across different seasons (e.g., handling players who switch teams).
Code:
from bs4 import BeautifulSoup
from urllib.request import urlopen
def grabTeam(url):
    """Fetch *url* and return the <tbody> of its per-game stats table.

    Args:
        url: A basketball-reference.com team-season page URL.

    Returns:
        The bs4 Tag for the <tbody> of the table with id="per_game".

    Raises:
        AttributeError: if the page has no table with id="per_game"
            (``find`` returns None and ``.find("tbody")`` blows up).
    """
    # Close the HTTP response promptly instead of leaking the socket,
    # and name the parser explicitly so results don't depend on which
    # optional parsers happen to be installed.
    with urlopen(url) as page:
        soup = BeautifulSoup(page, "html.parser")
    pergame = soup.find("table", id="per_game")
    return pergame.find("tbody")
def grabPlayers(per_game):
    """Print the text of every stat cell for each player row.

    Args:
        per_game: bs4 Tag for the <tbody> of the per-game table
            (as returned by ``grabTeam``).
    """
    # Iterate the rows and cells directly instead of repeatedly calling
    # `.td` and `.extract()`: the old approach re-searched the tree from
    # the top for every cell (quadratic) and destructively mutated the
    # parsed document.  This also drops the hard-coded 24-column
    # assumption, so a layout change won't misalign the output.
    for row in per_game.find_all("tr"):
        for cell in row.find_all("td"):
            print(cell.string)
        # Blank line to separate one player's stats from the next.
        print()
# Team abbreviations as used in basketball-reference.com URLs (2012-13 season).
teams = ['ATL', 'BOS', 'BRK', 'CHA', 'CHI', 'CLE', 'DAL', 'DEN', 'DET', 'GSW', 'HOU', 'IND', 'LAC', 'LAL', 'MEM', 'MIA', 'MIL', 'MIN', 'NOH', 'NYK', 'OKC', 'ORL', 'PHI', 'PHO', 'POR', 'SAC', 'SAS', 'TOR', 'UTA', 'WAS']

# Iterate the list directly: `while x < 30` silently breaks if the team
# list ever changes length, and the old `team = ""` variable was unused.
for team in teams:
    team_url = "http://www.basketball-reference.com/teams/" + team + "/2013.html"
    grabPlayers(grabTeam(team_url))
And seasonal info:
Code:
from bs4 import BeautifulSoup
from urllib.request import urlopen
def grabTeam(url):
    """Fetch *url* and return the <tbody> of its team-stats table.

    Args:
        url: A basketball-reference.com team-season page URL.

    Returns:
        The bs4 Tag for the <tbody> of the table with id="team_stats".

    Raises:
        AttributeError: if the page has no table with id="team_stats"
            (``find`` returns None and ``.find("tbody")`` blows up).
    """
    # Close the HTTP response promptly instead of leaking the socket,
    # and name the parser explicitly so results don't depend on which
    # optional parsers happen to be installed.
    with urlopen(url) as page:
        soup = BeautifulSoup(page, "html.parser")
    pergame = soup.find("table", id="team_stats")
    return pergame.find("tbody")
def grabStats(team_stats):
    """Print 19 stat columns for each row of the team-stats table.

    The first three cells of each row are skipped (non-stat columns on
    basketball-reference's team_stats table), then the next 19 stat
    values are printed, one per line, with a blank line between rows.

    Args:
        team_stats: bs4 Tag for the <tbody> of the team-stats table
            (as returned by ``grabTeam``).
    """
    # Iterate rows and slice the cell list instead of the old pattern of
    # calling `.td.extract()` 22 times per row: that re-searched the tree
    # for every cell and destructively mutated the parsed document.  This
    # also handles however many rows the tbody actually has, rather than
    # hard-coding exactly two.
    for row in team_stats.find_all("tr"):
        cells = row.find_all("td")
        # cells[0:3] are skipped; cells[3:22] are the 19 stats we want.
        for cell in cells[3:22]:
            print(cell.string)
        # Blank line to separate one row's stats from the next.
        print()
# Team abbreviations as used in basketball-reference URLs (NJN pre-move).
teams = ['ATL', 'BOS', 'NJN', 'CHA', 'CHI', 'CLE', 'DAL', 'DEN', 'DET', 'GSW', 'HOU', 'IND', 'LAC', 'LAL', 'MEM', 'MIA', 'MIL', 'MIN', 'NOH', 'NYK', 'OKC', 'ORL', 'PHI', 'PHO', 'POR', 'SAC', 'SAS', 'TOR', 'UTA', 'WAS']

# `for` loops over range/list replace the hand-rolled while counters, and
# the unused `team = ""` variable is gone.  The `try` is narrowed to the
# single call that can actually raise, so unrelated bugs aren't swallowed.
for year in range(2000, 2013):
    for team in teams:
        team_url = "http://www.basketball-reference.com/teams/" + team + "/" + str(year) + ".html"
        print(team)
        try:
            grabStats(grabTeam(team_url))
        except AttributeError:
            # Franchise didn't exist that season (e.g. CHA pre-2005):
            # grabTeam finds no table and raises; skip to the next team.
            continue
The exception above is there to handle situations where a team did not exist for a given season — for example, the Charlotte Bobcats did not exist pre-2005. I have a lot of tweaking to do; I just coded it this way for the moment so that the entire program would run through 2000->2013.
Thoughts and suggestions are more than welcome