slight link extraction refactor

This commit is contained in:
Matt Behrens 2018-01-26 20:19:25 -05:00
parent 33e7da196f
commit 8e451e9d62
5 changed files with 41 additions and 4 deletions

1
.gitignore vendored
View file

@ -3,5 +3,6 @@
*.mp4 *.mp4
*.pyc *.pyc
*.secret *.secret
.cache
.vscode .vscode
__pycache__ __pycache__

View file

@ -16,3 +16,4 @@ youtube-dl = "==2017.11.26"
[dev-packages] [dev-packages]
"e1839a8" = {path = ".", editable = true} "e1839a8" = {path = ".", editable = true}
pytest = "*"

29
Pipfile.lock generated
View file

@ -1,7 +1,7 @@
{ {
"_meta": { "_meta": {
"hash": { "hash": {
"sha256": "fc88391d1ef7a5701ec70beb432898e855e039105cb787bf9dd91ba3aa35c604" "sha256": "971f24d8efade8b46ac288d607d6200a9ecab3a67a81b3f717d38baf720be05d"
}, },
"host-environment-markers": { "host-environment-markers": {
"implementation_name": "cpython", "implementation_name": "cpython",
@ -153,6 +153,13 @@
} }
}, },
"develop": { "develop": {
"attrs": {
"hashes": [
"sha256:a17a9573a6f475c99b551c0e0a812707ddda1ec9653bed04c13841404ed6f450",
"sha256:1c7960ccfd6a005cd9f7ba884e6316b5e430a3f1a6c37c5f87d8b43f83b54ec9"
],
"version": "==17.4.0"
},
"certifi": { "certifi": {
"hashes": [ "hashes": [
"sha256:14131608ad2fd56836d33a71ee60fa1c82bc9d2c8d98b7bdbc631fe1b3cd1296", "sha256:14131608ad2fd56836d33a71ee60fa1c82bc9d2c8d98b7bdbc631fe1b3cd1296",
@ -232,6 +239,26 @@
], ],
"version": "==1.2.1" "version": "==1.2.1"
}, },
"pluggy": {
"hashes": [
"sha256:7f8ae7f5bdf75671a718d2daf0a64b7885f74510bcd98b1a0bb420eb9a9d0cff"
],
"version": "==0.6.0"
},
"py": {
"hashes": [
"sha256:8cca5c229d225f8c1e3085be4fcf306090b00850fefad892f9d96c7b6e2f310f",
"sha256:ca18943e28235417756316bfada6cd96b23ce60dd532642690dcfdaba988a76d"
],
"version": "==1.5.2"
},
"pytest": {
"hashes": [
"sha256:b84878865558194630c6147f44bdaef27222a9f153bbd4a08908b16bf285e0b1",
"sha256:53548280ede7818f4dc2ad96608b9f08ae2cc2ca3874f2ceb6f97e3583f25bc4"
],
"version": "==3.3.2"
},
"python-dateutil": { "python-dateutil": {
"hashes": [ "hashes": [
"sha256:95511bae634d69bc7329ba55e646499a842bc4ec342ad54a8cdb65645a0aad3c", "sha256:95511bae634d69bc7329ba55e646499a842bc4ec342ad54a8cdb65645a0aad3c",

View file

@ -85,8 +85,8 @@ def stream(api_base_url):
def extract_tags(toot): def extract_tags(toot):
return [tag['name'] for tag in toot['tags']] return [tag['name'] for tag in toot['tags']]
def has_external_link_class(class_string): def link_is_internal(link):
classes = class_string.split(' ') classes = link.attrib.get('class', '').split(' ')
if classes: if classes:
return 'mention' in classes return 'mention' in classes
@ -95,7 +95,7 @@ def has_external_link_class(class_string):
def extract_links(toot): def extract_links(toot):
html = HTML(toot['content']) html = HTML(toot['content'])
all_links = html.cssselect('a') all_links = html.cssselect('a')
return [link.attrib['href'] for link in all_links if not has_external_link_class(link.attrib.get('class', ''))] return [link.attrib['href'] for link in all_links if not link_is_internal(link)]
def main(): def main():
from getpass import getpass from getpass import getpass

8
test_fediplay.py Normal file
View file

@ -0,0 +1,8 @@
import fediplay
def test_extract_links():
    """Check which hrefs ``fediplay.extract_links`` returns for a sample toot.

    The sample content holds three anchors: two hashtag anchors whose
    ``class`` attribute includes ``mention``, and one YouTube anchor with no
    ``class`` attribute. Only the YouTube href is expected in the result.
    """
    sample_content = "<p><a href=\"https://cybre.space/tags/nowplaying\" class=\"mention hashtag\" rel=\"tag\">#<span>nowplaying</span></a> <a href=\"https://cybre.space/tags/fediplay\" class=\"mention hashtag\" rel=\"tag\">#<span>fediplay</span></a> Grimes ft. Janelle Mon\u00e1e - Venus Fly <a href=\"https://www.youtube.com/watch?v=eTLTXDHrgtw\" rel=\"nofollow noopener\" target=\"_blank\"><span class=\"invisible\">https://www.</span><span class=\"ellipsis\">youtube.com/watch?v=eTLTXDHrgt</span><span class=\"invisible\">w</span></a></p>"
    expected_urls = ['https://www.youtube.com/watch?v=eTLTXDHrgtw']

    # The function under test reads the HTML from the toot's 'content' key.
    assert fediplay.extract_links({'content': sample_content}) == expected_urls