From d55626d4452bf15849b3ead2266a2ca89f8d9c8d Mon Sep 17 00:00:00 2001 From: collerek Date: Mon, 13 Dec 2021 01:23:03 +0100 Subject: [PATCH] refactor and cleaning --- cps/metadata_provider/comicvine.py | 4 +- cps/metadata_provider/google.py | 12 +- cps/metadata_provider/lubimyczytac.py | 468 +++++++++++++------------- cps/services/Metadata.py | 33 +- requirements.txt | 1 + 5 files changed, 278 insertions(+), 240 deletions(-) diff --git a/cps/metadata_provider/comicvine.py b/cps/metadata_provider/comicvine.py index 8f496608..195e68f8 100644 --- a/cps/metadata_provider/comicvine.py +++ b/cps/metadata_provider/comicvine.py @@ -26,7 +26,7 @@ class ComicVine(Metadata): __name__ = "ComicVine" __id__ = "comicvine" - def search(self, query, __): + def search(self, query, generic_cover=""): val = list() apikey = "57558043c53943d5d1e96a9ad425b0eb85532ee6" if self.active: @@ -52,7 +52,7 @@ class ComicVine(Metadata): v['tags'] = ["Comics", seriesTitle] v['rating'] = 0 v['series'] = seriesTitle - v['cover'] = r['image'].get('original_url') + v['cover'] = r['image'].get('original_url', generic_cover) v['source'] = { "id": self.__id__, "description": "ComicVine Books", diff --git a/cps/metadata_provider/google.py b/cps/metadata_provider/google.py index f3d02d8e..8be8ad74 100644 --- a/cps/metadata_provider/google.py +++ b/cps/metadata_provider/google.py @@ -17,19 +17,20 @@ # along with this program. If not, see . # Google Books api document: https://developers.google.com/books/docs/v1/using - - import requests + from cps.services.Metadata import Metadata + class Google(Metadata): __name__ = "Google" __id__ = "google" + BASE_URL = "https://www.googleapis.com/books/v1/volumes?q=" - def search(self, query, __): + def search(self, query, generic_cover=""): if self.active: val = list() - result = requests.get("https://www.googleapis.com/books/v1/volumes?q="+query.replace(" ","+")) + result = requests.get(Google.BASE_URL + query.replace(" ","+")) for r in result.json()['items']: v = dict() v['id'] = r['id'] @@ -43,7 +44,8 @@ class Google(Metadata): if r['volumeInfo'].get('imageLinks'): v['cover'] = r['volumeInfo']['imageLinks']['thumbnail'].replace("http://", "https://") else: - v['cover'] = "/../../../static/generic_cover.jpg" + # v['cover'] = "/../../../static/generic_cover.jpg" + v['cover'] = generic_cover v['source'] = { "id": self.__id__, "description": "Google Books", diff --git a/cps/metadata_provider/lubimyczytac.py b/cps/metadata_provider/lubimyczytac.py index aab50bb6..ee66d1b4 100644 --- a/cps/metadata_provider/lubimyczytac.py +++ b/cps/metadata_provider/lubimyczytac.py @@ -15,47 +15,47 @@ # # You should have received a copy of the GNU General Public License # along with this program. If not, see . +import datetime import json import re -from typing import Dict, List +from typing import Dict, Generator, List, Optional, Tuple, Union from urllib.parse import quote import requests -from cps.services.Metadata import Metadata -from lxml.html import fromstring, tostring +from dateutil import parser +from html2text import HTML2Text +from lxml.html import HtmlElement, fromstring, tostring +from markdown2 import Markdown + +from cps.services.Metadata import MetaRecord, Metadata + +SYMBOLS_TO_TRANSLATE = ( + "öÖüÜóÓőŐúÚéÉáÁűŰíÍąĄćĆęĘłŁńŃóÓśŚźŹżŻ", + "oOuUoOoOuUeEaAuUiIaAcCeElLnNoOsSzZzZ", +) +SYMBOL_TRANSLATION_MAP = dict( + [(ord(a), ord(b)) for (a, b) in zip(*SYMBOLS_TO_TRANSLATE)] +) -def get_int_or_float(v): - number_as_float = float(v) +def get_int_or_float(value: str) -> Union[int, float]: + number_as_float = float(value) number_as_int = int(number_as_float) return number_as_int if number_as_float == number_as_int else number_as_float -def strip_accents(s): - if s is None: - return s - else: - symbols = ( - "öÖüÜóÓőŐúÚéÉáÁűŰíÍąĄćĆęĘłŁńŃóÓśŚźŹżŻ", - "oOuUoOoOuUeEaAuUiIaAcCeElLnNoOsSzZzZ", - ) - tr = dict([(ord(a), ord(b)) for (a, b) in zip(*symbols)]) - return s.translate(tr) # .lower() +def strip_accents(s: Optional[str]) -> Optional[str]: + return s.translate(SYMBOL_TRANSLATION_MAP) if s is not None else s -def sanitize_comments_html(html): - from markdown2 import Markdown - +def sanitize_comments_html(html: str) -> str: text = html2text(html) md = Markdown() html = md.convert(text) return html -def html2text(html): - from html2text import HTML2Text - import re - +def html2text(html: str) -> str: # replace tags with as becomes emphasis in html2text if isinstance(html, bytes): html = html.decode("utf-8") @@ -92,26 +92,36 @@ class LubimyCzytac(Metadata): PUBLISHER = f"{CONTAINER}//dt[contains(text(),'Wydawnictwo:')]{SIBLINGS}/a/text()" LANGUAGES = f"{CONTAINER}//dt[contains(text(),'Język:')]{SIBLINGS}/text()" DESCRIPTION = f"{CONTAINER}//div[@class='collapse-content']" - SERIES = f"{CONTAINER}//span/a[contains(@href,'/cykl/')]" + SERIES = f"{CONTAINER}//span/a[contains(@href,'/cykl/')]/text()" DETAILS = "//div[@id='book-details']" PUBLISH_DATE = "//dt[contains(@title,'Data pierwszego wydania" FIRST_PUBLISH_DATE = f"{DETAILS}{PUBLISH_DATE} oryginalnego')]{SIBLINGS}[1]/text()" FIRST_PUBLISH_DATE_PL = f"{DETAILS}{PUBLISH_DATE} polskiego')]{SIBLINGS}[1]/text()" TAGS = "//nav[@aria-label='breadcrumb']//a[contains(@href,'/ksiazki/k/')]/text()" + RATING = "//meta[@property='books:rating:value']/@content" COVER = "//meta[@property='og:image']/@content" + ISBN = "//meta[@property='books:isbn']/@content" + META_TITLE = "//meta[@property='og:description']/@content" SUMMARY = "//script[@type='application/ld+json']//text()" - def search(self, query, __): + def search(self, query: str, generic_cover: str = "") -> Optional[List]: if self.active: result = requests.get(self._prepare_query(title=query)) root = fromstring(result.text) - matches = self._parse_search_results(root=root) + lc_parser = LubimyCzytacParser(root=root, metadata=self) + matches = lc_parser.parse_search_results() if matches: - for ind, match in enumerate(matches): - matches[ind] = self._parse_single_book(match=match) + final_matches = [] + for match in matches: + response = requests.get(match.get("url")) + match = lc_parser.parse_single_book( + match=match, response=response, generic_cover=generic_cover + ) + final_matches.append(match) + return final_matches return matches def _prepare_query(self, title: str) -> str: @@ -128,9 +138,7 @@ class LubimyCzytac(Metadata): token for token in title.lower().split(" ") if len(token) > 1 ] else: - title_tokens = list( - self.get_title_tokens(title, strip_joiners=False, strip_subtitle=True) - ) + title_tokens = list(self.get_title_tokens(title, strip_joiners=False)) if title_tokens: tokens = [quote(t.encode("utf-8")) for t in title_tokens] query = query + "%20".join(tokens) @@ -138,215 +146,21 @@ class LubimyCzytac(Metadata): return "" return f"{LubimyCzytac.BASE_URL}/szukaj/ksiazki?phrase={query}" - def _parse_search_results(self, root) -> List[Dict]: - matches = [] - results = root.xpath(LubimyCzytac.BOOK_SEARCH_RESULT_XPATH) - for result in results: - title = result.xpath( - f"{LubimyCzytac.SINGLE_BOOK_RESULT_XPATH}" - f"{LubimyCzytac.TITLE_TEXT_PATH}" - ) - book_url = result.xpath( - f"{LubimyCzytac.SINGLE_BOOK_RESULT_XPATH}" f"{LubimyCzytac.URL_PATH}" - ) - authors = result.xpath( - f"{LubimyCzytac.SINGLE_BOOK_RESULT_XPATH}" - f"{LubimyCzytac.AUTHORS_PATH}" - ) - - if not title or not book_url or not authors: - continue - title = title[0].strip() - book_url = LubimyCzytac.BASE_URL + book_url[0] - book_id = book_url.replace(f"{LubimyCzytac.BASE_URL}/ksiazka/", "").split( - "/" - )[0] - matches.append( - {"id": book_id, "title": title, "authors": authors, "url": book_url} - ) - return matches - - def _parse_single_book(self, match: Dict) -> Dict: - url = match.get("url") - result = requests.get(url) - root = fromstring(result.text) - match["series"], match["series_index"] = self._parse_series(root=root) - match["tags"] = self._parse_tags(root=root) - match["publisher"] = self._parse_publisher(root=root) - match["publishedDate"] = self._parse_from_summary( - root=root, attribute_name="datePublished" - ) - match["rating"] = self._parse_rating(root=root) - match["description"] = self._parse_description(root=root) - match["cover"] = self._parse_cover(root=root) - match["source"] = { - "id": self.__id__, - "description": self.__name__, - "link": LubimyCzytac.BASE_URL, - } - match['languages'] = self._parse_languages(root=root) - match["identifiers"] = { - "isbn": self._parse_isbn(root=root), - "lubimyczytac": match["id"], - } - return match - - def _parse_cover(self, root): - imgcol_node = root.xpath('//meta[@property="og:image"]/@content') - if imgcol_node: - img_url = imgcol_node[0] - return img_url - - def _parse_publisher(self, root): - publisher = root.xpath(LubimyCzytac.PUBLISHER) - if publisher: - return publisher[0] - else: - return None - - def _parse_languages(self, root): - lang = root.xpath(LubimyCzytac.LANGUAGES) - languages = list() - if lang: - lang = lang[0].strip() - if "polski" in lang: - languages.append("Polish") - if "angielski" in lang: - languages.append("English") - if not languages: - return ['Polish'] - return languages - - def _parse_series(self, root): - try: - series_node = root.xpath(LubimyCzytac.SERIES) - if series_node: - series_lst = root.xpath(f"{LubimyCzytac.SERIES}/text()") - if series_lst: - series_txt = series_lst - else: - series_txt = None - else: - return (None, None) - - if series_txt: - ser_string = [series_txt[0].replace("\n", "").strip()] - ser_nazwa = ser_string - for ser in ser_string: - if "tom " in ser: - ser_info = ser.split(" (tom ", 1) - ser_nazwa = ser.split(" (tom ")[0] - break - - if ser_info: - series_index_unicode = ser_info[1] - series_index_string = str( - series_index_unicode.replace(" ", "").replace(")", "") - ) - # Sprawdzamy, czy cykl nie jest kompletem/pakietem tomów, np. 1-3 - if "-" in series_index_string: - series_index_string_temp = series_index_string.split("-", 1) - series_index_string = series_index_string_temp[0] - if series_index_string.replace(".", "").isdigit() is True: - series_index = get_int_or_float(series_index_string) - else: - series_index = 0 - else: - series_index = 0 - series = ser_nazwa - return (series, series_index) - except: - return (None, None) - - def _parse_tags(self, root): - tags = None - try: - tags_from_genre = root.xpath(LubimyCzytac.TAGS) - if tags_from_genre: - tags = tags_from_genre - tags = [w.replace(", itd.", " itd.") for w in tags] - return tags - else: - return None - except: - return tags - - def _parse_from_summary(self, root, attribute_name: str) -> str: - data = json.loads(root.xpath(LubimyCzytac.SUMMARY)[0]) - value = data.get(attribute_name) - return value.strip() if value is not None else value - - def _parse_rating(self, root): - rating_node = root.xpath(LubimyCzytac.RATING) - if rating_node: - rating_value = round(float((rating_node[0]).replace(",", ".")) / 2) - return rating_value - return None - - def _parse_date(self, root, xpath="first_publish"): - options = { - "first_publish": LubimyCzytac.FIRST_PUBLISH_DATE, - "first_publish_pl": LubimyCzytac.FIRST_PUBLISH_DATE_PL, - } - path = options.get(xpath) - from dateutil import parser - - data = root.xpath(path) - if data: - first_pub_date = data[0].strip() - return parser.parse(first_pub_date) - return None - - def _parse_isbn(self, root): - isbn_node = root.xpath('//meta[@property="books:isbn"]/@content')[0] - return isbn_node - - def _parse_description(self, root): - comments = "" - description_node = root.xpath(LubimyCzytac.DESCRIPTION) - if description_node: - for zrodla in root.xpath('//p[@class="source"]'): - zrodla.getparent().remove(zrodla) - comments = tostring(description_node[0], method="html") - comments = sanitize_comments_html(comments) - - else: - # try - description_node = root.xpath('//meta[@property="og:description"]/@content') - if description_node: - comments = description_node[0] - comments = sanitize_comments_html(comments) - - pages = self._parse_from_summary(root=root, attribute_name="numberOfPages") - if pages: - comments += f'

Książka ma {pages} stron(y).

' - - first_publish_date = self._parse_date(root=root) - if first_publish_date: - comments += f'

Data pierwszego wydania: {first_publish_date.strftime("%d.%m.%Y")}

' - - first_publish_date_pl = self._parse_date(root=root, xpath="first_publish_pl") - if first_publish_date_pl: - comments += f'

Data pierwszego wydania w Polsce: {first_publish_date_pl.strftime("%d.%m.%Y")}

' - - return comments - - def get_title_tokens(self, title, strip_joiners=True, strip_subtitle=False): + @staticmethod + def get_title_tokens( + title: str, strip_joiners: bool = True + ) -> Generator[str, None, None]: """ - Taken from https://github.com/kovidgoyal/calibre/blob/master/src/calibre/ebooks/metadata/sources/base.py. + Taken from calibre source code """ - # strip sub-titles - if strip_subtitle: - subtitle = re.compile(r"([\(\[\{].*?[\)\]\}]|[/:\\].*$)") - if len(subtitle.sub("", title)) > 1: - title = subtitle.sub("", title) - title_patterns = [ (re.compile(pat, re.IGNORECASE), repl) for pat, repl in [ # Remove things like: (2010) (Omnibus) etc. ( - r"(?i)[({\[](\d{4}|omnibus|anthology|hardcover|audiobook|audio\scd|paperback|turtleback|mass\s*market|edition|ed\.)[\])}]", + r"(?i)[({\[](\d{4}|omnibus|anthology|hardcover|" + r"audiobook|audio\scd|paperback|turtleback|" + r"mass\s*market|edition|ed\.)[\])}]", "", ), # Remove any strings that contain the substring edition inside @@ -371,3 +185,193 @@ class LubimyCzytac(Metadata): not strip_joiners or token.lower() not in ("a", "and", "the", "&") ): yield token + + +class LubimyCzytacParser: + PAGES_TEMPLATE = "

Książka ma {0} stron(y).

" + PUBLISH_DATE_TEMPLATE = "

Data pierwszego wydania: {0}

" + PUBLISH_DATE_PL_TEMPLATE = ( + "

Data pierwszego wydania w Polsce: {0}

" + ) + + def __init__(self, root: HtmlElement, metadata: Metadata) -> None: + self.root = root + self.metadata = metadata + + def parse_search_results(self) -> List[Dict]: + matches = [] + results = self.root.xpath(LubimyCzytac.BOOK_SEARCH_RESULT_XPATH) + for result in results: + title = self._parse_xpath_node( + root=result, + xpath=f"{LubimyCzytac.SINGLE_BOOK_RESULT_XPATH}" + f"{LubimyCzytac.TITLE_TEXT_PATH}", + ) + + book_url = self._parse_xpath_node( + root=result, + xpath=f"{LubimyCzytac.SINGLE_BOOK_RESULT_XPATH}" + f"{LubimyCzytac.URL_PATH}", + ) + authors = self._parse_xpath_node( + root=result, + xpath=f"{LubimyCzytac.SINGLE_BOOK_RESULT_XPATH}" + f"{LubimyCzytac.AUTHORS_PATH}", + take_first=False, + ) + if not all([title, book_url, authors]): + continue + matches.append( + { + "id": book_url.replace(f"/ksiazka/", "").split("/")[0], + "title": title, + "authors": [strip_accents(author) for author in authors], + "url": LubimyCzytac.BASE_URL + book_url, + } + ) + return matches + + def parse_single_book( + self, match: Dict, response, generic_cover: str + ) -> MetaRecord: + self.root = fromstring(response.text) + match["series"], match["series_index"] = self._parse_series() + match["tags"] = self._parse_tags() + match["publisher"] = self._parse_publisher() + match["publishedDate"] = self._parse_from_summary( + attribute_name="datePublished" + ) + match["rating"] = self._parse_rating() + match["description"] = self._parse_description() + match["cover"] = self._parse_cover(generic_cover=generic_cover) + match["source"] = { + "id": self.metadata.__id__, + "description": self.metadata.__name__, + "link": LubimyCzytac.BASE_URL, + } + match["languages"] = self._parse_languages() + match["identifiers"] = { + "isbn": self._parse_isbn(), + "lubimyczytac": match["id"], + } + return match + + def _parse_xpath_node( + self, + xpath: str, + root: HtmlElement = None, + take_first: bool = True, + strip_element: bool = True, + ) -> Optional[Union[str, List[str]]]: + root = root if root is not None else self.root + node = root.xpath(xpath) + if not node: + return None + return ( + (node[0].strip() if strip_element else node[0]) + if take_first + else [x.strip() for x in node] + ) + + def _parse_cover(self, generic_cover) -> Optional[str]: + return ( + self._parse_xpath_node(xpath=LubimyCzytac.COVER, take_first=True) + or generic_cover + ) + + def _parse_publisher(self) -> Optional[str]: + return self._parse_xpath_node(xpath=LubimyCzytac.PUBLISHER, take_first=True) + + def _parse_languages(self) -> List[str]: + languages = list() + lang = self._parse_xpath_node(xpath=LubimyCzytac.LANGUAGES, take_first=True) + if lang: + if "polski" in lang: + languages.append("Polish") + if "angielski" in lang: + languages.append("English") + return languages + + def _parse_series(self) -> Tuple[Optional[str], Optional[Union[float, int]]]: + series_index = 0 + series = self._parse_xpath_node(xpath=LubimyCzytac.SERIES, take_first=True) + if series: + if "tom " in series: + series_name, series_info = series.split(" (tom ", 1) + series_info = series_info.replace(" ", "").replace(")", "") + # Check if book is not a bundle, i.e. chapter 1-3 + if "-" in series_info: + series_info = series_info.split("-", 1)[0] + if series_info.replace(".", "").isdigit() is True: + series_index = get_int_or_float(series_info) + return series_name, series_index + return None, None + + def _parse_tags(self) -> List[str]: + tags = self._parse_xpath_node(xpath=LubimyCzytac.TAGS, take_first=False) + return [ + strip_accents(w.replace(", itd.", " itd.")) + for w in tags + if isinstance(w, str) + ] + + def _parse_from_summary(self, attribute_name: str) -> Optional[str]: + value = None + summary_text = self._parse_xpath_node(xpath=LubimyCzytac.SUMMARY) + if summary_text: + data = json.loads(summary_text) + value = data.get(attribute_name) + return value.strip() if value is not None else value + + def _parse_rating(self) -> Optional[str]: + rating = self._parse_xpath_node(xpath=LubimyCzytac.RATING) + return round(float(rating.replace(",", ".")) / 2) if rating else rating + + def _parse_date(self, xpath="first_publish") -> Optional[datetime.datetime]: + options = { + "first_publish": LubimyCzytac.FIRST_PUBLISH_DATE, + "first_publish_pl": LubimyCzytac.FIRST_PUBLISH_DATE_PL, + } + date = self._parse_xpath_node(xpath=options.get(xpath)) + return parser.parse(date) if date else None + + def _parse_isbn(self) -> Optional[str]: + return self._parse_xpath_node(xpath=LubimyCzytac.ISBN) + + def _parse_description(self) -> str: + description = "" + description_node = self._parse_xpath_node( + xpath=LubimyCzytac.DESCRIPTION, strip_element=False + ) + if description_node is not None: + for source in self.root.xpath('//p[@class="source"]'): + source.getparent().remove(source) + description = tostring(description_node, method="html") + description = sanitize_comments_html(description) + + else: + description_node = self._parse_xpath_node(xpath=LubimyCzytac.META_TITLE) + if description_node is not None: + description = description_node + description = sanitize_comments_html(description) + description = self._add_extra_info_to_description(description=description) + return description + + def _add_extra_info_to_description(self, description: str) -> str: + pages = self._parse_from_summary(attribute_name="numberOfPages") + if pages: + description += LubimyCzytacParser.PAGES_TEMPLATE.format(pages) + + first_publish_date = self._parse_date() + if first_publish_date: + description += LubimyCzytacParser.PUBLISH_DATE_TEMPLATE.format( + first_publish_date.strftime("%d.%m.%Y") + ) + + first_publish_date_pl = self._parse_date(xpath="first_publish_pl") + if first_publish_date_pl: + description += LubimyCzytacParser.PUBLISH_DATE_PL_TEMPLATE.format( + first_publish_date_pl.strftime("%d.%m.%Y") + ) + + return description diff --git a/cps/services/Metadata.py b/cps/services/Metadata.py index d6e4e7d5..17a9e38e 100644 --- a/cps/services/Metadata.py +++ b/cps/services/Metadata.py @@ -15,13 +15,44 @@ # # You should have received a copy of the GNU General Public License # along with this program. If not, see . +import abc +from typing import Dict, List, Optional, TypedDict, Union -class Metadata(): +class Metadata: __name__ = "Generic" + __id__ = "generic" def __init__(self): self.active = True def set_status(self, state): self.active = state + + @abc.abstractmethod + def search(self, query: str, generic_cover: str): + pass + + +class MetaSourceInfo(TypedDict): + id: str + description: str + link: str + + +class MetaRecord(TypedDict): + id: Union[str, int] + title: str + authors: List[str] + url: str + cover: str + series: Optional[str] + series_index: Optional[Union[int, float]] + tags: Optional[List[str]] + publisher: Optional[str] + publishedDate: Optional[str] + rating: Optional[int] + description: Optional[str] + source: MetaSourceInfo + languages: Optional[List[str]] + identifiers: Dict[str, Union[str, int]] diff --git a/requirements.txt b/requirements.txt index d1f58a8d..d09c2157 100644 --- a/requirements.txt +++ b/requirements.txt @@ -16,3 +16,4 @@ lxml>=3.8.0,<4.7.0 flask-wtf>=0.14.2,<1.1.0 markdown2==2.4.2 html2text==2020.1.16 +python-dateutil==2.8.2