import re
import logging
import mimetypes
import json

import requests
from bs4 import BeautifulSoup

_logger = logging.getLogger(__name__)


class BaseMIMEParser(object):
    """
    BaseMIMEParser can be sub-classed to define custom handlers for
    determining the MIME type of external urls.
    """
    pattern = re.compile(r'.*$')

    @staticmethod
    def get_mimetype(url):
        """
        Guess based on the file extension.

        Args:
            url (text): Web url that was linked to by a reddit submission.

        Returns:
            modified_url (text): The url (or filename) that will be used
                when constructing the command to run.
            content_type (text): The mime-type that will be used when
                constructing the command to run. If the mime-type is
                unknown, return None and the program will fall back to
                using the web browser.
        """
        filename = url.split('?')[0]
        filename = filename.split('#')[0]
        content_type, _ = mimetypes.guess_type(filename)
        return url, content_type
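
# Illustrative example of the default behavior (a sketch; the exact result
# depends on the platform's mimetypes table):
#   >>> BaseMIMEParser.get_mimetype('https://example.com/cat.jpg?w=300')
#   ('https://example.com/cat.jpg?w=300', 'image/jpeg')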


class OpenGraphMIMEParser(BaseMIMEParser):
    """
    The Open Graph protocol is used on many web pages.

    <meta property="og:image" content="https://xxxx.jpg?ig_cache_key=xxxxx" />
    <meta property="og:video:secure_url" content="https://xxxxx.mp4" />

    If the page is a video page, both of the above tags will be present and
    priority is given to video content.

    see http://ogp.me
    """
    pattern = re.compile(r'.*$')

    @staticmethod
    def get_mimetype(url):
        page = requests.get(url)
        soup = BeautifulSoup(page.content, 'html.parser')
        for og_type in ['video', 'image']:
            prop = 'og:' + og_type + ':secure_url'
            tag = soup.find('meta', attrs={'property': prop})
            if not tag:
                prop = 'og:' + og_type
                tag = soup.find('meta', attrs={'property': prop})
            if tag:
                return BaseMIMEParser.get_mimetype(tag.get('content'))

        return url, None


class VideoTagMIMEParser(BaseMIMEParser):
    """
    <video width="320" height="240" controls>
        <source src="movie.mp4" res="HD" type="video/mp4">
        <source src="movie.mp4" res="SD" type="video/mp4">
        <source src="movie.ogg" type="video/ogg">
    </video>
    """
    pattern = re.compile(r'.*$')

    @staticmethod
    def get_mimetype(url):
        page = requests.get(url)
        soup = BeautifulSoup(page.content, 'html.parser')

        # TODO: Handle pages with multiple videos
        video = soup.find('video')
        source = None
        if video:
            source = video.find('source', attrs={'res': 'HD'})
            source = source or video.find('source',
                                          attrs={'type': 'video/mp4'})
            source = source or video.find('source')
        if source:
            return source.get('src'), source.get('type')
        else:
            return url, None


class GfycatMIMEParser(BaseMIMEParser):
    """
    Gfycat provides a primitive json api to generate image links. URLs can be
    downloaded as either gif, mp4, webm, or mjpg. Mp4 was selected because
    it's fast and works with VLC.

        https://gfycat.com/api

        https://gfycat.com/UntidyAcidicIberianemeraldlizard -->
        https://giant.gfycat.com/UntidyAcidicIberianemeraldlizard.webm
    """
    pattern = re.compile(r'https?://(www\.)?gfycat\.com/[^.]+$')

    @staticmethod
    def get_mimetype(url):
        identifier = url.split('/')[-1]
        api_url = 'https://api.gfycat.com/v1/gfycats/{}'.format(identifier)
        resp = requests.get(api_url)
        image_url = resp.json()['gfyItem']['mp4Url']
        return image_url, 'video/mp4'
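
# Illustrative flow (the response shape is inferred from the code above;
# the returned link is hypothetical):
#   https://gfycat.com/UntidyAcidicIberianemeraldlizard
#   -> GET https://api.gfycat.com/v1/gfycats/UntidyAcidicIberianemeraldlizard
#   -> json()['gfyItem']['mp4Url']
#   -> ('https://giant.gfycat.com/UntidyAcidicIberianemeraldlizard.mp4',
#       'video/mp4')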


class YoutubeMIMEParser(BaseMIMEParser):
    """
    Youtube videos can be streamed with vlc or downloaded with youtube-dl.
    Assign a custom mime-type so they can be referenced in mailcap.
    """
    pattern = re.compile(
        r'(?:https?://)?(m\.)?(?:youtu\.be/|(?:www\.)?youtube\.com/watch'
        r'(?:\.php)?\?.*v=)([a-zA-Z0-9\-_]+)')

    @staticmethod
    def get_mimetype(url):
        return url, 'video/x-youtube'
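
# A mailcap entry can then dispatch on the custom type; a hypothetical
# example (not part of this module):
#   video/x-youtube; mpv "%s"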


class VimeoMIMEParser(BaseMIMEParser):
    """
    Vimeo videos can be streamed with vlc or downloaded with youtube-dl.
    Assign a custom mime-type so they can be referenced in mailcap.
    """
    pattern = re.compile(r'https?://(www\.)?vimeo\.com/\d+$')

    @staticmethod
    def get_mimetype(url):
        return url, 'video/x-youtube'


class GifvMIMEParser(BaseMIMEParser):
    """
    Special case for .gifv, which is a custom video format that imgur
    serves as html with a special <video> frame. Note that attempting to
    download as .webm also returns this html page. However, .mp4 appears
    to return the raw video file.
    """
    pattern = re.compile(r'.*[.]gifv$')

    @staticmethod
    def get_mimetype(url):
        modified_url = url[:-4] + 'mp4'
        return modified_url, 'video/mp4'
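
# Example of the rewrite (follows directly from the code above; the url is
# illustrative):
#   http://i.imgur.com/example.gifv
#   -> ('http://i.imgur.com/example.mp4', 'video/mp4')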


class RedditUploadsMIMEParser(BaseMIMEParser):
    """
    Reddit uploads do not have a file extension, but we can grab the
    mime-type from the page header.
    """
    pattern = re.compile(r'https://i\.reddituploads\.com/.+$')

    @staticmethod
    def get_mimetype(url):
        page = requests.head(url)
        content_type = page.headers.get('Content-Type', '')
        # Strip the encoding, e.g. 'image/jpeg; charset=UTF-8' -> 'image/jpeg'
        content_type = content_type.split(';')[0]
        return url, content_type


class RedditVideoMIMEParser(BaseMIMEParser):
    """
    Reddit hosted videos/gifs.
    Media uses the MPEG-DASH format (.mpd).
    """
    pattern = re.compile(r'https://v\.redd\.it/.+$')

    @staticmethod
    def get_mimetype(url):
        request_url = url + '/DASHPlaylist.mpd'
        page = requests.get(request_url)
        soup = BeautifulSoup(page.content, 'html.parser')
        # If the playlist has no audio track, return the highest-bandwidth
        # video representation directly; otherwise hand the whole DASH
        # playlist to a streaming player via the custom mime-type.
        if not soup.find('representation', attrs={'mimetype': 'audio/mp4'}):
            reps = soup.find_all('representation',
                                 attrs={'mimetype': 'video/mp4'})
            reps = sorted(reps, reverse=True,
                          key=lambda t: int(t.get('bandwidth')))
            if reps:
                url_suffix = reps[0].find('baseurl')
                if url_suffix:
                    return url + '/' + url_suffix.text, 'video/mp4'

        return request_url, 'video/x-youtube'
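
# Illustrative outcome (the <BaseURL> value is hypothetical):
#   https://v.redd.it/abc123 with no audio track and a playlist entry
#   <BaseURL>DASH_720.mp4</BaseURL>
#   -> ('https://v.redd.it/abc123/DASH_720.mp4', 'video/mp4')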


class ImgurApiMIMEParser(BaseMIMEParser):
    """
    Imgur now provides a json API exposing its entire infrastructure. Each
    Imgur page has an associated hash and can contain an album, a gallery,
    or a single image.

    The default client token for TTRV is shared among users and allows a
    maximum global number of requests per day of 12,500. If we find that
    this limit is not sufficient for all of ttrv's traffic, this method
    will be revisited.

    Reference:
        https://apidocs.imgur.com
    """
    CLIENT_ID = None
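    # Expected to be assigned by the application before use (assumption);
    # e.g. ImgurApiMIMEParser.CLIENT_ID = '<your imgur client id>'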

    pattern = re.compile(
        r'https?://(www\.)?(m\.)?imgur\.com/'
        r'((?P<domain>a|album|gallery)/)?(?P<hash>[a-zA-Z0-9]+)$')

    @classmethod
    def get_mimetype(cls, url):
        endpoint = 'https://api.imgur.com/3/{domain}/{page_hash}'
        headers = {'authorization': 'Client-ID {0}'.format(cls.CLIENT_ID)}

        m = cls.pattern.match(url)
        page_hash = m.group('hash')

        if m.group('domain') in ('a', 'album'):
            domain = 'album'
        else:
            # This could be a gallery or a single image, but there doesn't
            # seem to be a way to reliably distinguish between the two.
            # Assume a gallery, which appears to be more common, and fall
            # back to an image request upon failure.
            domain = 'gallery'

        if not cls.CLIENT_ID:
            return cls.fallback(url, domain)

        api_url = endpoint.format(domain=domain, page_hash=page_hash)
        r = requests.get(api_url, headers=headers)

        if domain == 'gallery' and r.status_code != 200:
            # Not a gallery, try to download using the image endpoint
            api_url = endpoint.format(domain='image', page_hash=page_hash)
            r = requests.get(api_url, headers=headers)

        if r.status_code != 200:
            _logger.warning('Imgur API failure, status %s', r.status_code)
            return cls.fallback(url, domain)

        data = r.json().get('data')
        if not data:
            _logger.warning('Imgur API failure, resp %s', r.json())
            return cls.fallback(url, domain)

        if 'images' in data and len(data['images']) > 1:
            # TODO: handle imgur albums with mixed content, i.e. jpeg and gifv
            link = ' '.join(
                [d['link'] for d in data['images'] if not d['animated']])
            mime = 'image/x-imgur-album'
        else:
            # This also handles single-image galleries
            data = data['images'][0] if 'images' in data else data
            link = data['mp4'] if data['animated'] else data['link']
            mime = 'video/mp4' if data['animated'] else data['type']

        link = link.replace('http://', 'https://')
        return link, mime

    @classmethod
    def fallback(cls, url, domain):
        """
        Attempt to use one of the scrapers if the API doesn't work
        """
        if domain == 'album':
            # The old Imgur album scraper has stopped working and I haven't
            # put in the effort to figure out why
            return url, None
        else:
            return ImgurScrapeMIMEParser.get_mimetype(url)


class ImgurScrapeMIMEParser(BaseMIMEParser):
    """
    The majority of imgur links don't point directly to the image, so we
    need to open the provided url and scrape the page for the link.

    Scrape the actual image url from an imgur landing page. Imgur
    intentionally obscures this on most reddit links in order to draw more
    traffic for their advertisements.

    There are a couple of <meta> tags that supply the relevant info:
        <meta name="twitter:image" content="https://i.imgur.com/xrqQ4LEh.jpg">
        <meta property="og:image" content="http://i.imgur.com/xrqQ4LE.jpg?fb">
        <link rel="image_src" href="http://i.imgur.com/xrqQ4LE.jpg">
    """
    pattern = re.compile(r'https?://(www\.)?(m\.)?imgur\.com/[^.]+$')

    @staticmethod
    def get_mimetype(url):
        page = requests.get(url)
        soup = BeautifulSoup(page.content, 'html.parser')
        tag = soup.find('meta', attrs={'name': 'twitter:image'})
        if tag:
            url = tag.get('content')
            if GifvMIMEParser.pattern.match(url):
                return GifvMIMEParser.get_mimetype(url)
        return BaseMIMEParser.get_mimetype(url)


class InstagramMIMEParser(OpenGraphMIMEParser):
    """
    Instagram uses the Open Graph protocol
    """
    pattern = re.compile(r'https?://(www\.)?instagr((am\.com)|\.am)/p/[^.]+$')


class StreamableMIMEParser(OpenGraphMIMEParser):
    """
    Streamable uses the Open Graph protocol
    """
    pattern = re.compile(r'https?://(www\.)?streamable\.com/[^.]+$')


class LiveleakMIMEParser(BaseMIMEParser):
    """
    https://www.liveleak.com/view?i=12c_3456789
    <video>
        <source src="https://cdn.liveleak.com/..mp4" res="HD" type="video/mp4">
        <source src="https://cdn.liveleak.com/..mp4" res="SD" type="video/mp4">
    </video>
    Sometimes only one video source is available.
    """
    pattern = re.compile(r'https?://((www|m)\.)?liveleak\.com/view\?i=\w+$')

    @staticmethod
    def get_mimetype(url):
        page = requests.get(url)
        soup = BeautifulSoup(page.content, 'html.parser')

        urls = []
        videos = soup.find_all('video')
        for vid in videos:
            source = vid.find('source', attrs={'res': 'HD'})
            source = source or vid.find('source')
            if source:
                urls.append((source.get('src'), source.get('type')))

        # TODO: Handle pages with multiple videos
        if urls:
            return urls[0]

        def filter_iframe(t):
            # Guard with .get() so iframes without a src don't raise
            return t.name == 'iframe' and 'youtube.com' in t.get('src', '')

        iframe = soup.find_all(filter_iframe)
        if iframe:
            return YoutubeMIMEParser.get_mimetype(iframe[0]['src'].strip('/'))

        return url, None


class ClippitUserMIMEParser(BaseMIMEParser):
    """
    Clippit uses a video player container
    """
    pattern = re.compile(r'https?://(www\.)?clippituser\.tv/c/.+$')

    @staticmethod
    def get_mimetype(url):
        page = requests.get(url)
        soup = BeautifulSoup(page.content, 'html.parser')
        tag = soup.find(id='player-container')
        if tag:
            # Prefer the HD stream, falling back to SD if it's missing
            quality = ['data-{}-file'.format(_) for _ in ['hd', 'sd']]
            new_url = tag.get(quality[0]) or tag.get(quality[1])
            if new_url:
                return new_url, 'video/mp4'

        return url, None


class GifsMIMEParser(OpenGraphMIMEParser):
    """
    Gifs.com uses the Open Graph protocol
    """
    pattern = re.compile(r'https?://(www\.)?gifs\.com/gif/.+$')


class GiphyMIMEParser(OpenGraphMIMEParser):
    """
    Giphy.com uses the Open Graph protocol
    """
    pattern = re.compile(r'https?://(www\.)?giphy\.com/gifs/.+$')


class ImgflipMIMEParser(OpenGraphMIMEParser):
    """
    imgflip.com uses the Open Graph protocol
    """
    pattern = re.compile(r'https?://(www\.)?imgflip\.com/i/.+$')


class LivememeMIMEParser(OpenGraphMIMEParser):
    """
    livememe.com uses the Open Graph protocol
    """
    pattern = re.compile(r'https?://(www\.)?livememe\.com/[^.]+$')


class MakeamemeMIMEParser(OpenGraphMIMEParser):
    """
    makeameme.org uses the Open Graph protocol
    """
    pattern = re.compile(r'https?://(www\.)?makeameme\.org/meme/.+$')


class FlickrMIMEParser(OpenGraphMIMEParser):
    """
    Flickr uses the Open Graph protocol
    """
    # TODO: handle albums/photosets (https://www.flickr.com/services/api)
    pattern = re.compile(r'https?://(www\.)?flickr\.com/photos/[^/]+/[^/]+/?$')


class StreamjaMIMEParser(VideoTagMIMEParser):
    """
    Embedded HTML5 video element
    """
    pattern = re.compile(r'https?://(www\.)?streamja\.com/[^/]+/?$')


class WorldStarHipHopMIMEParser(BaseMIMEParser):
    """
    <video>
        <source src="https://hw-mobile.worldstarhiphop.com/..mp4" type="video/mp4">
        <source src="" type="video/mp4">
    </video>
    Sometimes only one video source is available.
    """
    pattern = re.compile(
        r'https?://((www|m)\.)?worldstarhiphop\.com/videos/video\.php\?v=\w+$')

    @staticmethod
    def get_mimetype(url):
        page = requests.get(url)
        soup = BeautifulSoup(page.content, 'html.parser')

        def filter_source(t):
            # Guard with .get() so sources missing src/type don't raise
            return (t.name == 'source' and t.get('src')
                    and t.get('type') == 'video/mp4')

        source = soup.find_all(filter_source)
        if source:
            return source[0]['src'], 'video/mp4'

        def filter_iframe(t):
            return t.name == 'iframe' and 'youtube.com' in t.get('src', '')

        iframe = soup.find_all(filter_iframe)
        if iframe:
            return YoutubeMIMEParser.get_mimetype(iframe[0]['src'])

        return url, None


class RedGifsParser(BaseMIMEParser):
    """
    Extract the video url from the page's application/ld+json metadata.
    """
    pattern = re.compile(r'https?://(www\.)?redgifs\.com/watch/.+$')

    @staticmethod
    def get_mimetype(url):
        page = requests.get(url)
        soup = BeautifulSoup(page.content, 'html.parser')
        tag = soup.find('script', attrs={'type': 'application/ld+json'})
        if tag:
            ld_json = json.loads(tag.text)
            return ld_json['video']['contentUrl'], 'video/mp4'

        return url, None


# Parsers should be listed in the order they will be checked
parsers = [
    StreamjaMIMEParser,
    ClippitUserMIMEParser,
    StreamableMIMEParser,
    InstagramMIMEParser,
    GfycatMIMEParser,
    ImgurApiMIMEParser,
    RedditUploadsMIMEParser,
    RedditVideoMIMEParser,
    YoutubeMIMEParser,
    VimeoMIMEParser,
    LiveleakMIMEParser,
    FlickrMIMEParser,
    GifsMIMEParser,
    GiphyMIMEParser,
    ImgflipMIMEParser,
    LivememeMIMEParser,
    MakeamemeMIMEParser,
    WorldStarHipHopMIMEParser,
    GifvMIMEParser,
    RedGifsParser,
    BaseMIMEParser]
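
# Minimal usage sketch (illustrative only; the real dispatch lives elsewhere
# in ttrv). BaseMIMEParser is listed last and its pattern matches everything,
# so the loop always returns.
def _example_get_mimetype(url):
    for parser in parsers:
        if parser.pattern.match(url):
            return parser.get_mimetype(url)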