Source code for mchartanalyzer.chartscraper

from datetime import datetime

import requests
from bs4 import BeautifulSoup

from . import constants
from .ultimateguitarstrategy import UltimateGuitarStrategy
from .databasehandler import DatabaseHandler
from .chartparser import ChartParser
from .objects.chartdata import ChartData

"""
Steps:

1. Get links for artist.
2. For each chord sheet, parse it for relevant data, and write the formatted chart out.
Potential filename format: artist-name_song-name_session-id.md (see the sketch after this docstring)
"""

class ChartScraper:
    """
    Scrapes chord chart websites for a given artist and hands each chart to
    the parser.
    """

    def __init__(self):
        self.parser = ChartParser()
        self.dbHandler = DatabaseHandler()
        self.chordSheetLinks = []

        # One scraping strategy per supported chart source.
        self.scrapeStrategies = []
        self.scrapeStrategies.append(UltimateGuitarStrategy())

        # When enabled, only the first TEST_MODE_SONG_LIMIT songs per artist are scraped.
        self.testModeEnabled = False

        # When enabled, URLs scraped within the cooldown window are skipped.
        self.scrapeCooldownEnabled = True
    def _isUrlValidTarget(self, url):
        """
        Returns True when the given URL hasn't been scraped before, or when it
        was last scraped longer ago than the cooldown period. For our purposes,
        30 days is the cooldown time for a URL.
        """
        chartData = self.dbHandler.getChartByUrl(url)

        if chartData is None:
            return True

        dtScrape = datetime.strptime(chartData.updateTime, constants.DATETIME_FORMAT)
        dtNow = datetime.now()
        dtDifference = dtNow - dtScrape

        return dtDifference.days > constants.URL_SCRAPE_COOLDOWN_DAYS
    def scrape(self, artistName):
        """
        Scrapes websites for song charts by a given artist, then feeds that
        information to the parser. After scraping is complete, the parser
        analysis is triggered.
        """
        scrapeSourceNames = []
        artistSourceUrls = []

        print("\nScraping for " + artistName + " songs...")

        # Set up artist information, then send it to the parser.
        for scrapeStrategy in self.scrapeStrategies:
            scrapeSourceNames.append(scrapeStrategy.getSourceName())
            artistSourceUrls.append(scrapeStrategy.getArtistUrl(artistName))

        self.parser.setArtistData(artistName, scrapeSourceNames, artistSourceUrls)

        # Scrape the chart sources for song charts, then call the parser for each one.
        for scrapeStrategy in self.scrapeStrategies:
            songUrls = scrapeStrategy.getSongUrlsForArtist(artistName)

            for index, songUrl in enumerate(songUrls):
                if self.testModeEnabled and index >= constants.TEST_MODE_SONG_LIMIT:
                    break

                if self.scrapeCooldownEnabled and not self._isUrlValidTarget(songUrl):
                    print("Skipping chart: " + songUrl)
                else:
                    print("Parsing chart in: " + songUrl)
                    resp = requests.get(songUrl)
                    pageContent = resp.content
                    soup = BeautifulSoup(pageContent, "html.parser")
                    chartContentHtml = soup.select(".js-tab-content")[0]
                    chartContent = chartContentHtml.get_text()
                    self.parser.parseChart(scrapeStrategy.getSongTitle(soup), songUrl, chartContent)

        print("Scraping complete!")
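
# A minimal usage sketch, assuming the rest of the package (database handler,
# scrape strategies, and parser) is configured. The artist name below is only
# an example, not something this module defines.
if __name__ == "__main__":
    scraper = ChartScraper()
    scraper.testModeEnabled = True  # only scrape the first few songs per artist
    scraper.scrape("Radiohead")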