import asyncio
import math
import re
import urllib.parse

from typing import Optional, Tuple

from bs4 import BeautifulSoup

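# ScrapingMixin is meant to be mixed into a cog/client that provides the
# attributes used below (assumed from the call sites, not defined here):
#   self.session                      - an aiohttp.ClientSession
#   self.fetch(ctx, url, params=None, handling=...) - HTTP helper returning text
#   self.login_token                  - last.fm session token, or falsy
#   self.period_http_format(period)   - maps period names to date_preset values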
class ScrapingMixin:
    async def artist_top(self, ctx, period, artistname, datatype, name):
        """Scrape either top tracks or top albums from the last.fm library page."""
        if not self.login_token:
            return None, []
        url = (
            f"https://last.fm/user/{name}/library/music/{artistname}/"
            f"+{datatype}?date_preset={self.period_http_format(period)}"
        )
        data = await self.fetch(ctx, url, handling="text")
        soup = BeautifulSoup(data, "html.parser")
        data = []
        # BeautifulSoup's find() returns None for a missing element instead of
        # raising ValueError, so guard with an explicit None check.
        chartlist = soup.find("tbody", {"data-playlisting-add-entries": ""})
        if chartlist is None:
            return None, []

        artist = {
            "image_url": soup.find("span", {"class": "library-header-image"})
            .find("img")
            .get("src")
            .replace("avatar70s", "avatar300s"),
            "formatted_name": soup.find("a", {"class": "library-header-crumb"}).text.strip(),
        }

        items = chartlist.findAll("tr", {"class": "chartlist-row"})
        for item in items:
            name = item.find("td", {"class": "chartlist-name"}).find("a").get("title")
            playcount = (
                item.find("span", {"class": "chartlist-count-bar-value"})
                .text.replace("scrobbles", "")
                .replace("scrobble", "")
                .strip()
            )
            data.append((name, int(playcount.replace(",", ""))))

        return artist, data

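    # Usage sketch (hypothetical values; `ctx` comes from the invoking command
    # and `period` must be something period_http_format() understands):
    #
    #   artist, rows = await self.artist_top(ctx, "7day", "Radiohead", "tracks", "someuser")
    #   # artist -> {"image_url": "...", "formatted_name": "Radiohead"}
    #   # rows   -> [("Creep", 42), ("No Surprises", 17), ...]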
    async def lyrics_musixmatch(self, artistsong) -> Tuple[Optional[str], Optional[str]]:
        # Strip punctuation and collapse whitespace to get a clean search query.
        artistsong = re.sub("[^a-zA-Z0-9 \n.]", "", artistsong)
        artistsong = re.sub(r"\s+", " ", artistsong).strip()
        headers = {
            "User-Agent": "Mozilla/5.0 (X11; Arch Linux; Linux x86_64; rv:66.0) Gecko/20100101 Firefox/66.0"
        }
        async with self.session.get(
            "https://musixmatch.com/search/{}".format(artistsong).replace(" ", "%20"),
            headers=headers,
        ) as resp:
            if resp.status == 200:
                result = await resp.text()
            else:
                return None, None
        soup = BeautifulSoup(result, "html.parser")
        songurl = soup.find("a", {"class": "title"})
        if songurl is None:
            return None, None
        url = "https://www.musixmatch.com" + songurl["href"]
        async with self.session.get(url, headers=headers) as resp:
            result = await resp.text()
        soup = BeautifulSoup(result, "html.parser")
        # Brittle string surgery on the rendered page text: the song name comes
        # from the page title (the part before "|"), and the lyrics body sits
        # between the "languages" widget and the "Report" link. Any Musixmatch
        # layout change breaks this.
        lyrics = soup.text.split('"body":"')
        lyrics = lyrics[0]
        songname = lyrics.split("|")[0]
        lyrics = lyrics.split('","language"')[0]
        try:
            lyrics = lyrics.split("languages")[1]
        except IndexError:
            return None, None
        lyrics = lyrics.split("Report")[0]
        lyrics = lyrics.replace("\\n", "\n")
        lyrics = lyrics.replace("\\", "")
        lyrics = lyrics.replace("&amp;", "&")
        lyrics = lyrics.replace("`", "'")
        lyrics = lyrics.strip()
        return lyrics, songname.strip()

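    # Usage sketch (hypothetical input; Musixmatch markup changes often, so
    # callers should always handle the (None, None) case):
    #
    #   lyrics, songname = await self.lyrics_musixmatch("Radiohead Creep")
    #   if lyrics is None:
    #       ...  # not found, or the page layout changed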
    async def scrape_artist_image(self, artist, ctx):
        url = f"https://www.last.fm/music/{urllib.parse.quote_plus(artist)}/+images"
        data = await self.fetch(ctx, url, handling="text")

        soup = BeautifulSoup(data, "html.parser")
        if soup is None:
            return ""
        image = soup.find("img", {"class": "image-list-image"})
        if image is None:
            try:
                image = soup.find("li", {"class": "image-list-item-wrapper"}).find("a").find("img")
            except AttributeError:
                return ""
        return image["src"].replace("/avatar170s/", "/300x300/") if image else ""

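    # Usage sketch (hypothetical artist; returns "" when no image is found):
    #
    #   image_url = await self.scrape_artist_image("Radiohead", ctx)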
    async def scrape_artists_for_chart(self, ctx, username, period, amount):
        period_format_map = {
            "7day": "LAST_7_DAYS",
            "1month": "LAST_30_DAYS",
            "3month": "LAST_90_DAYS",
            "6month": "LAST_180_DAYS",
            "12month": "LAST_365_DAYS",
            "overall": "ALL",
        }
        tasks = []
        url = f"https://www.last.fm/user/{username}/library/artists"
        # The library lists 50 artists per page, so fetch ceil(amount / 50)
        # pages concurrently.
        for i in range(1, math.ceil(amount / 50) + 1):
            params = {"date_preset": period_format_map[period], "page": i}
            task = asyncio.ensure_future(self.fetch(ctx, url, params, handling="text"))
            tasks.append(task)

        responses = await asyncio.gather(*tasks)

        images = []
        for data in responses:
            if len(images) >= amount:
                break
            soup = BeautifulSoup(data, "html.parser")
            imagedivs = soup.findAll("td", {"class": "chartlist-image"})
            images += [
                div.find("img")["src"].replace("/avatar70s/", "/300x300/")
                for div in imagedivs
            ]

        return images

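    # Usage sketch (hypothetical values; `period` must be a key of
    # period_format_map above):
    #
    #   images = await self.scrape_artists_for_chart(ctx, "someuser", "7day", 9)
    #   # images -> artist image urls; page-granular, so it may exceed 9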
    async def get_similar_artists(self, artistname, ctx):
        similar = []
        url = f"https://last.fm/music/{artistname}"
        data = await self.fetch(ctx, url, handling="text")
        soup = BeautifulSoup(data, "html.parser")
        for artist in soup.findAll("h3", {"class": "artist-similar-artists-sidebar-item-name"}):
            similar.append(artist.find("a").text)
        listeners = (
            soup.find("li", {"class": "header-metadata-tnew-item--listeners"}).find("abbr").text
        )
        return similar, listeners

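    # The three scrapers below read the total playcount from the first
    # "metadata-display" element of a last.fm library page; they differ only
    # in which library path they request (artist, artist/_/track,
    # artist/album). Usage sketch with hypothetical values:
    #
    #   plays = await self.get_playcount_scraper(ctx, "someuser", "Radiohead", "overall")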
    async def get_playcount_scraper(self, ctx, username, artistname, period):
        url = (
            f"https://last.fm/user/{username}/library/music/{artistname}"
            f"?date_preset={self.period_http_format(period)}"
        )
        data = await self.fetch(ctx, url, handling="text")
        soup = BeautifulSoup(data, "html.parser")
        divs = soup.findAll(class_="metadata-display")
        if not divs:
            return 0
        div = divs[0]
        plays = div.get_text()
        return int(plays.split(" ")[0].replace(",", ""))

    async def get_playcount_track_scraper(self, ctx, username, artistname, trackname, period):
        url = (
            f"https://last.fm/user/{username}/library/music/{artistname}/_/{trackname}"
            f"?date_preset={self.period_http_format(period)}"
        )
        data = await self.fetch(ctx, url, handling="text")
        soup = BeautifulSoup(data, "html.parser")
        divs = soup.findAll(class_="metadata-display")
        if not divs:
            return 0
        div = divs[0]
        plays = div.get_text()
        return int(plays.split(" ")[0].replace(",", ""))

    async def get_playcount_album_scraper(self, ctx, username, artistname, albumname, period):
        url = (
            f"https://last.fm/user/{username}/library/music/{artistname}/{albumname}"
            f"?date_preset={self.period_http_format(period)}"
        )
        data = await self.fetch(ctx, url, handling="text")
        soup = BeautifulSoup(data, "html.parser")
        divs = soup.findAll(class_="metadata-display")
        if not divs:
            return 0
        div = divs[0]
        plays = div.get_text()
        return int(plays.split(" ")[0].replace(",", ""))
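    # Possible consolidation of the three playcount scrapers above (a sketch,
    # not part of the original interface):
    #
    #   async def _playcount_from_library(self, ctx, path, period):
    #       url = f"https://last.fm{path}?date_preset={self.period_http_format(period)}"
    #       data = await self.fetch(ctx, url, handling="text")
    #       divs = BeautifulSoup(data, "html.parser").findAll(class_="metadata-display")
    #       return int(divs[0].get_text().split(" ")[0].replace(",", "")) if divs else 0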