[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[www] 01/02: better text extraction.
From: gnunet
Subject: [www] 01/02: better text extraction.
Date: Mon, 11 Nov 2019 22:18:35 +0100
This is an automated email from the git hooks/post-receive script.
ng0 pushed a commit to branch master
in repository www.
commit 97beb6b2a9d0722112f6de9636fe5a21fb76af76
Author: ng0 <address@hidden>
AuthorDate: Mon Nov 11 21:15:15 2019 +0000
better text extraction.
---
template.py | 22 +++++++++++++++++++---
1 file changed, 19 insertions(+), 3 deletions(-)
diff --git a/template.py b/template.py
index e2a689b..89e4856 100755
--- a/template.py
+++ b/template.py
@@ -33,8 +33,8 @@ from pathlib import Path
import hashlib
from bs4 import BeautifulSoup
from ruamel.yaml import YAML
+import html.parser
-# TODO: Turn repetition into a class.
env = jinja2.Environment(loader=jinja2.FileSystemLoader(
os.path.dirname(__file__)),
@@ -45,6 +45,22 @@ env = jinja2.Environment(loader=jinja2.FileSystemLoader(
autoescape=False)
+class extractText(html.parser.HTMLParser):
+ def __init__(self):
+ super(extractText, self).__init__()
+ self.result = []
+ def handle_data(self, data):
+ self.result.append(data)
+ def text_in(self):
+ return ''.join(self.result)
+
+
+def html2text(html):
+ k = extractText()
+ k.feed(html)
+ return k.text_in()
+
+
def localized(filename, locale, *args):
if len(args) == 0:
return "../" + locale + "/" + filename
@@ -165,7 +181,7 @@ def preview_text(filename, count):
for i in soup.findAll('p')[1]:
k.append(i)
b = ''.join(str(e) for e in k)
- text = b.replace("\n", "")
+ text = html2text(b.replace("\n", ""))
textreduced = (text[:count] + '...') if len(text) > count else (text +
'..')
return(textreduced)
@@ -271,7 +287,7 @@ def main():
conf=yaml.load(site_configfile)
for item in conf["newsposts"]:
- item['abstract'] = abstract_news(item['page'], 300)
+ item['abstract'] = abstract_news(item['page'], 1000)
print("generating template")
generate_site("template", conf)
print("generating news")
--
To stop receiving notification emails like this one, please contact
address@hidden.