-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathapp.py
69 lines (54 loc) · 1.86 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import json
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
# Landing page listing the day's featured papers; each <h3> wraps one entry.
BASE_URL = "https://huggingface.co/papers"

# Fetch the index page once. A timeout prevents the script from hanging
# forever on a stalled connection, and raise_for_status fails fast instead
# of silently parsing an HTTP error page as if it were the paper list.
page = requests.get(BASE_URL, timeout=30)
page.raise_for_status()
soup = BeautifulSoup(page.content, "html.parser")
h3s = soup.find_all("h3")

# Accumulates one dict per paper: title, url, abstract, date_published.
papers = []
def extract_abstraction(url):
    """Fetch a single paper page and return its ``(abstract, datetime)`` pair.

    Parameters
    ----------
    url : str
        Absolute URL of one Hugging Face paper page.

    Returns
    -------
    tuple[str, str | None]
        The abstract text with the leading ``"Abstract"`` header stripped and
        newlines collapsed to spaces, plus the page's ``<time datetime=...>``
        value normalized to end with ``"Z"`` (or ``None`` if no ``<time>``
        element exists).

    Raises
    ------
    requests.RequestException
        On network failure, timeout, or a non-2xx HTTP response.
    AttributeError
        If the abstract container div is missing (the caller catches this).
    """
    # Timeout avoids hanging indefinitely; raise_for_status turns an HTTP
    # error page into a clear exception instead of a confusing parse failure.
    page = requests.get(url, timeout=30)
    page.raise_for_status()
    soup = BeautifulSoup(page.content, "html.parser")
    # The abstract div is identified only by utility CSS classes — brittle,
    # but the page offers no stabler hook.
    abstract = soup.find("div", {"class": "pb-8 pr-4 md:pr-16"}).text
    time_element = soup.find("time")
    datetime_str = time_element.get("datetime") if time_element else None
    # JSON Feed expects RFC 3339 timestamps; the page omits the trailing
    # "Z" (UTC) suffix, so append it when absent.
    if datetime_str and not datetime_str.endswith("Z"):
        datetime_str = f"{datetime_str}Z"
    if abstract.startswith("Abstract\n"):
        abstract = abstract[len("Abstract\n") :]
    abstract = abstract.replace("\n", " ")
    return abstract, datetime_str
# Walk every <h3> on the index page and collect paper metadata.
for h3 in tqdm(h3s):
    a = h3.find("a")
    # Not every <h3> necessarily wraps a link; the original code would
    # crash with AttributeError on a.text — skip such headings instead.
    if a is None or not a.get("href"):
        continue
    title = a.text
    link = a["href"]
    url = f"https://huggingface.co{link}"
    try:
        abstract, datetime_str = extract_abstraction(url)
    except Exception as e:
        # Best-effort: keep the entry with an empty abstract rather than
        # aborting the whole feed build over one bad page.
        print(f"Failed to extract abstract for {url}: {e}")
        abstract, datetime_str = "", None
    papers.append(
        {
            "title": title,
            "url": url,
            "abstract": abstract,
            "date_published": datetime_str,
        }
    )
# Assemble a JSON Feed v1 (https://jsonfeed.org/version/1) document,
# newest items first.
feed = {
    "version": "https://jsonfeed.org/version/1",
    "title": "Hugging Face Papers",
    "home_page_url": BASE_URL,
    "feed_url": "https://example.org/feed.json",
    "items": sorted(
        [
            {
                "id": p["url"],
                "title": p["title"].strip(),
                "content_text": p["abstract"].strip(),
                "url": p["url"],
                # date_published is optional in JSON Feed; omit it entirely
                # when the scrape found no <time> element.
                **({"date_published": p["date_published"]} if p["date_published"] else {}),
            }
            for p in papers
        ],
        # ISO-8601 strings sort chronologically as plain strings; undated
        # items fall back to "" and thus sink to the end after reversal.
        key=lambda x: x.get("date_published", ""),
        reverse=True,
    ),
}

# Explicit UTF-8 avoids the platform-dependent default encoding of open(),
# and ensure_ascii=False keeps non-ASCII characters (common in author names
# and abstracts) readable instead of \uXXXX escapes. indent=2 makes the
# feed diff-friendly.
with open("feed.json", "w", encoding="utf-8") as f:
    json.dump(feed, f, ensure_ascii=False, indent=2)