From 51bf35c2e41a16032e1250a8cac252195116a147 Mon Sep 17 00:00:00 2001 From: collerek Date: Mon, 13 Dec 2021 17:21:41 +0100 Subject: [PATCH] unify scholar --- cps/metadata_provider/comicvine.py | 87 ++++++++++++++++----------- cps/metadata_provider/google.py | 49 ++++++++------- cps/metadata_provider/lubimyczytac.py | 57 +++++++++--------- cps/metadata_provider/scholar.py | 66 +++++++++++--------- cps/search_metadata.py | 15 +++-- cps/services/Metadata.py | 32 ++++++---- optional-requirements.txt | 3 + requirements.txt | 3 - 8 files changed, 172 insertions(+), 140 deletions(-) diff --git a/cps/metadata_provider/comicvine.py b/cps/metadata_provider/comicvine.py index 195e68f8..56618d4b 100644 --- a/cps/metadata_provider/comicvine.py +++ b/cps/metadata_provider/comicvine.py @@ -17,49 +17,68 @@ # along with this program. If not, see . # ComicVine api document: https://comicvine.gamespot.com/api/documentation +from typing import Dict, List, Optional +from urllib.parse import quote import requests -from cps.services.Metadata import Metadata +from cps.services.Metadata import MetaRecord, MetaSourceInfo, Metadata class ComicVine(Metadata): __name__ = "ComicVine" __id__ = "comicvine" + DESCRIPTION = "ComicVine Books" + META_URL = "https://comicvine.gamespot.com/" + API_KEY = "57558043c53943d5d1e96a9ad425b0eb85532ee6" + BASE_URL = ( + f"https://comicvine.gamespot.com/api/search?api_key={API_KEY}" + f"&resources=issue&query=" + ) + QUERY_PARAMS = "&sort=name:desc&format=json" + HEADERS = {"User-Agent": "Not Evil Browser"} - def search(self, query, generic_cover=""): + def search( + self, query: str, generic_cover: str = "", locale: str = "en" + ) -> Optional[List[MetaRecord]]: val = list() - apikey = "57558043c53943d5d1e96a9ad425b0eb85532ee6" if self.active: - headers = { - 'User-Agent': 'Not Evil Browser' - } - - result = requests.get("https://comicvine.gamespot.com/api/search?api_key=" - + apikey + "&resources=issue&query=" + query + "&sort=name:desc&format=json", headers=headers) - for r in result.json()['results']: - seriesTitle = r['volume'].get('name', "") - if r.get('store_date'): - dateFomers = r.get('store_date') - else: - dateFomers = r.get('date_added') - v = dict() - v['id'] = r['id'] - v['title'] = seriesTitle + " #" + r.get('issue_number', "0") + " - " + ( r.get('name', "") or "") - v['authors'] = r.get('authors', []) - v['description'] = r.get('description', "") - v['publisher'] = "" - v['publishedDate'] = dateFomers - v['tags'] = ["Comics", seriesTitle] - v['rating'] = 0 - v['series'] = seriesTitle - v['cover'] = r['image'].get('original_url', generic_cover) - v['source'] = { - "id": self.__id__, - "description": "ComicVine Books", - "link": "https://comicvine.gamespot.com/" - } - v['url'] = r.get('site_detail_url', "") - val.append(v) + title_tokens = list(self.get_title_tokens(query, strip_joiners=False)) + if title_tokens: + tokens = [quote(t.encode("utf-8")) for t in title_tokens] + query = "%20".join(tokens) + result = requests.get( + f"{ComicVine.BASE_URL}{query}{ComicVine.QUERY_PARAMS}", + headers=ComicVine.HEADERS, + ) + for result in result.json()["results"]: + match = self._parse_search_result( + result=result, generic_cover=generic_cover, locale=locale + ) + val.append(match) return val - + def _parse_search_result( + self, result: Dict, generic_cover: str, locale: str + ) -> MetaRecord: + series = result["volume"].get("name", "") + series_index = result.get("issue_number", 0) + issue_name = result.get("name", "") + match = MetaRecord( + id=result["id"], + title=f"{series}#{series_index} - {issue_name}", + authors=result.get("authors", []), + url=result.get("site_detail_url", ""), + source=MetaSourceInfo( + id=self.__id__, + description=ComicVine.DESCRIPTION, + link=ComicVine.META_URL, + ), + series=series, + ) + match.cover = result["image"].get("original_url", generic_cover) + match.description = result.get("description", "") + match.publishedDate = result.get("store_date", result.get("date_added")) + match.series_index = series_index + match.tags = ["Comics", series] + match.identifiers = {"comicvine": match.id} + return match diff --git a/cps/metadata_provider/google.py b/cps/metadata_provider/google.py index 1074fe3d..5ac3e7ee 100644 --- a/cps/metadata_provider/google.py +++ b/cps/metadata_provider/google.py @@ -23,7 +23,7 @@ from urllib.parse import quote import requests from cps.isoLanguages import get_lang3, get_language_name -from cps.services.Metadata import MetaRecord, Metadata +from cps.services.Metadata import MetaRecord, MetaSourceInfo, Metadata class Google(Metadata): @@ -56,38 +56,37 @@ class Google(Metadata): def _parse_search_result( self, result: Dict, generic_cover: str, locale: str ) -> MetaRecord: - match = dict() - match["id"] = result["id"] - match["title"] = result["volumeInfo"]["title"] - match["authors"] = result["volumeInfo"].get("authors", []) - match["url"] = Google.BOOK_URL + result["id"] - match["cover"] = self._parse_cover(result=result, generic_cover=generic_cover) - match["description"] = result["volumeInfo"].get("description", "") - match["languages"] = self._parse_languages(result=result, locale=locale) - match["publisher"] = result["volumeInfo"].get("publisher", "") - match["publishedDate"] = result["volumeInfo"].get("publishedDate", "") - match["rating"] = result["volumeInfo"].get("averageRating", 0) - match["series"], match["series_index"] = "", 1 - match["tags"] = result["volumeInfo"].get("categories", []) + match = MetaRecord( + id=result["id"], + title=result["volumeInfo"]["title"], + authors=result["volumeInfo"].get("authors", []), + url=Google.BOOK_URL + result["id"], + source=MetaSourceInfo( + id=self.__id__, + description=Google.DESCRIPTION, + link=Google.META_URL, + ), + ) - match["source"] = { - "id": self.__id__, - "description": Google.DESCRIPTION, - "link": Google.META_URL, - } + match.cover = self._parse_cover(result=result, generic_cover=generic_cover) + match.description = result["volumeInfo"].get("description", "") + match.languages = self._parse_languages(result=result, locale=locale) + match.publisher = result["volumeInfo"].get("publisher", "") + match.publishedDate = result["volumeInfo"].get("publishedDate", "") + match.rating = result["volumeInfo"].get("averageRating", 0) + match.series, match.series_index = "", 1 + match.tags = result["volumeInfo"].get("categories", []) - match["identifiers"] = { - "google": match.get("id"), - } + match.identifiers = {"google": match.id} match = self._parse_isbn(result=result, match=match) return match @staticmethod - def _parse_isbn(result: Dict, match: Dict) -> Dict: + def _parse_isbn(result: Dict, match: MetaRecord) -> MetaRecord: identifiers = result["volumeInfo"].get("industryIdentifiers", []) for identifier in identifiers: if identifier.get("type") == Google.ISBN_TYPE: - match["identifiers"]["isbn"] = identifier.get("identifier") + match.identifiers["isbn"] = identifier.get("identifier") break return match @@ -100,7 +99,7 @@ class Google(Metadata): @staticmethod def _parse_languages(result: Dict, locale: str) -> List[str]: - language_iso2 = result.get("language", "") + language_iso2 = result["volumeInfo"].get("language", "") languages = ( [get_language_name(locale, get_lang3(language_iso2))] if language_iso2 diff --git a/cps/metadata_provider/lubimyczytac.py b/cps/metadata_provider/lubimyczytac.py index fd9ca4a7..4f6aca1e 100644 --- a/cps/metadata_provider/lubimyczytac.py +++ b/cps/metadata_provider/lubimyczytac.py @@ -27,7 +27,7 @@ from html2text import HTML2Text from lxml.html import HtmlElement, fromstring, tostring from markdown2 import Markdown -from cps.services.Metadata import MetaRecord, Metadata +from cps.services.Metadata import MetaRecord, MetaSourceInfo, Metadata SYMBOLS_TO_TRANSLATE = ( "öÖüÜóÓőŐúÚéÉáÁűŰíÍąĄćĆęĘłŁńŃóÓśŚźŹżŻ", @@ -158,61 +158,60 @@ class LubimyCzytacParser: self.root = root self.metadata = metadata - def parse_search_results(self) -> List[Dict]: + def parse_search_results(self) -> List[MetaRecord]: matches = [] results = self.root.xpath(LubimyCzytac.BOOK_SEARCH_RESULT_XPATH) for result in results: title = self._parse_xpath_node( root=result, xpath=f"{LubimyCzytac.SINGLE_BOOK_RESULT_XPATH}" - f"{LubimyCzytac.TITLE_TEXT_PATH}", + f"{LubimyCzytac.TITLE_TEXT_PATH}", ) book_url = self._parse_xpath_node( root=result, xpath=f"{LubimyCzytac.SINGLE_BOOK_RESULT_XPATH}" - f"{LubimyCzytac.URL_PATH}", + f"{LubimyCzytac.URL_PATH}", ) authors = self._parse_xpath_node( root=result, xpath=f"{LubimyCzytac.SINGLE_BOOK_RESULT_XPATH}" - f"{LubimyCzytac.AUTHORS_PATH}", + f"{LubimyCzytac.AUTHORS_PATH}", take_first=False, ) if not all([title, book_url, authors]): continue matches.append( - { - "id": book_url.replace(f"/ksiazka/", "").split("/")[0], - "title": title, - "authors": [strip_accents(author) for author in authors], - "url": LubimyCzytac.BASE_URL + book_url, - } + MetaRecord( + id=book_url.replace(f"/ksiazka/", "").split("/")[0], + title=title, + authors=[strip_accents(author) for author in authors], + url=LubimyCzytac.BASE_URL + book_url, + source=MetaSourceInfo( + id=self.metadata.__id__, + description=self.metadata.__name__, + link=LubimyCzytac.BASE_URL, + ) + ) ) return matches - def parse_single_book(self, match: Dict, generic_cover: str) -> MetaRecord: - response = requests.get(match.get("url")) + def parse_single_book(self, match: MetaRecord, generic_cover: str) -> MetaRecord: + response = requests.get(match.url) self.root = fromstring(response.text) - match["cover"] = self._parse_cover(generic_cover=generic_cover) - match["description"] = self._parse_description() - match["languages"] = self._parse_languages() - match["publisher"] = self._parse_publisher() - match["publishedDate"] = self._parse_from_summary( + match.cover = self._parse_cover(generic_cover=generic_cover) + match.description = self._parse_description() + match.languages = self._parse_languages() + match.publisher = self._parse_publisher() + match.publishedDate = self._parse_from_summary( attribute_name="datePublished" ) - match["rating"] = self._parse_rating() - match["series"], match["series_index"] = self._parse_series() - match["tags"] = self._parse_tags() - - match["source"] = { - "id": self.metadata.__id__, - "description": self.metadata.__name__, - "link": LubimyCzytac.BASE_URL, - } - match["identifiers"] = { + match.rating = self._parse_rating() + match.series, match.series_index = self._parse_series() + match.tags = self._parse_tags() + match.identifiers = { "isbn": self._parse_isbn(), - "lubimyczytac": match["id"], + "lubimyczytac": match.id, } return match diff --git a/cps/metadata_provider/scholar.py b/cps/metadata_provider/scholar.py index 6e13c768..0becaef0 100644 --- a/cps/metadata_provider/scholar.py +++ b/cps/metadata_provider/scholar.py @@ -15,47 +15,53 @@ # # You should have received a copy of the GNU General Public License # along with this program. If not, see . +import itertools +from typing import Dict, List, Optional +from urllib.parse import quote from scholarly import scholarly -from cps.services.Metadata import Metadata +from cps.services.Metadata import MetaRecord, MetaSourceInfo, Metadata class scholar(Metadata): __name__ = "Google Scholar" __id__ = "googlescholar" + META_URL = "https://scholar.google.com/" - def search(self, query, generic_cover=""): + def search( + self, query: str, generic_cover: str = "", locale: str = "en" + ) -> Optional[List[MetaRecord]]: val = list() if self.active: - scholar_gen = scholarly.search_pubs(' '.join(query.split('+'))) - i = 0 - for publication in scholar_gen: - v = dict() - v['id'] = "1234" # publication['bib'].get('title') - v['title'] = publication['bib'].get('title') - v['authors'] = publication['bib'].get('author', []) - v['description'] = publication['bib'].get('abstract', "") - v['publisher'] = publication['bib'].get('venue', "") - if publication['bib'].get('pub_year'): - v['publishedDate'] = publication['bib'].get('pub_year')+"-01-01" - else: - v['publishedDate'] = "" - v['tags'] = "" - v['ratings'] = 0 - v['series'] = "" - v['cover'] = generic_cover - v['url'] = publication.get('pub_url') or publication.get('eprint_url') or "", - v['source'] = { - "id": self.__id__, - "description": "Google Scholar", - "link": "https://scholar.google.com/" - } - val.append(v) - i += 1 - if (i >= 10): - break + title_tokens = list(self.get_title_tokens(query, strip_joiners=False)) + if title_tokens: + tokens = [quote(t.encode("utf-8")) for t in title_tokens] + query = " ".join(tokens) + scholar_gen = itertools.islice(scholarly.search_pubs(query), 10) + for result in scholar_gen: + match = self._parse_search_result( + result=result, generic_cover=generic_cover, locale=locale + ) + val.append(match) return val + def _parse_search_result( + self, result: Dict, generic_cover: str, locale: str + ) -> MetaRecord: + match = MetaRecord( + id=result.get("pub_url", result.get("eprint_url", "")), + title=result["bib"].get("title"), + authors=result["bib"].get("author", []), + url=result.get("pub_url", result.get("eprint_url", "")), + source=MetaSourceInfo( + id=self.__id__, description=self.__name__, link=scholar.META_URL + ), + ) - + match.cover = result.get("image", {}).get("original_url", generic_cover) + match.description = result["bib"].get("abstract", "") + match.publisher = result["bib"].get("venue", "") + match.publishedDate = result["bib"].get("pub_year") + "-01-01" + match.identifiers = {"scholar": match.id} + return match diff --git a/cps/search_metadata.py b/cps/search_metadata.py index a128f9ac..53cbf553 100644 --- a/cps/search_metadata.py +++ b/cps/search_metadata.py @@ -22,6 +22,7 @@ import inspect import json import os import sys +from dataclasses import asdict from flask import Blueprint, Response, request, url_for from flask_login import current_user @@ -99,11 +100,13 @@ def metadata_change_active_provider(prov_name): log.error("Invalid request received: {}".format(request)) return "Invalid request", 400 if "initial" in new_state and prov_name: - for c in cl: - if c.__id__ == prov_name: - data = c.search(new_state.get("query", "")) - break - return Response(json.dumps(data), mimetype="application/json") + data = [] + provider = next((c for c in cl if c.__id__ == prov_name), None) + if provider is not None: + data = provider.search(new_state.get("query", "")) + return Response( + json.dumps([asdict(x) for x in data]), mimetype="application/json" + ) return "" @@ -123,5 +126,5 @@ def metadata_search(): if active.get(c.__id__, True) } for future in concurrent.futures.as_completed(meta): - data.extend(future.result()) + data.extend([asdict(x) for x in future.result()]) return Response(json.dumps(data), mimetype="application/json") diff --git a/cps/services/Metadata.py b/cps/services/Metadata.py index 09fc70ce..f4a5662c 100644 --- a/cps/services/Metadata.py +++ b/cps/services/Metadata.py @@ -16,32 +16,38 @@ # You should have received a copy of the GNU General Public License # along with this program. If not, see . import abc +import dataclasses +import os import re -from typing import Dict, Generator, List, Optional, TypedDict, Union +from typing import Dict, Generator, List, Optional, Union + +from cps import constants -class MetaSourceInfo(TypedDict): +@dataclasses.dataclass +class MetaSourceInfo: id: str description: str link: str -class MetaRecord(TypedDict): +@dataclasses.dataclass +class MetaRecord: id: Union[str, int] title: str authors: List[str] url: str - cover: str - series: Optional[str] - series_index: Optional[Union[int, float]] - tags: Optional[List[str]] - publisher: Optional[str] - publishedDate: Optional[str] - rating: Optional[int] - description: Optional[str] source: MetaSourceInfo - languages: Optional[List[str]] - identifiers: Dict[str, Union[str, int]] + cover: str = os.path.join(constants.STATIC_DIR, 'generic_cover.jpg') + description: Optional[str] = "" + series: Optional[str] = None + series_index: Optional[Union[int, float]] = 0 + identifiers: Dict[str, Union[str, int]] = dataclasses.field(default_factory=dict) + publisher: Optional[str] = None + publishedDate: Optional[str] = None + rating: Optional[int] = 0 + languages: Optional[List[str]] = dataclasses.field(default_factory=list) + tags: Optional[List[str]] = dataclasses.field(default_factory=list) class Metadata: diff --git a/optional-requirements.txt b/optional-requirements.txt index 03f58bb5..17c4b878 100644 --- a/optional-requirements.txt +++ b/optional-requirements.txt @@ -32,6 +32,9 @@ SQLAlchemy-Utils>=0.33.5,<0.38.0 # extracting metadata rarfile>=2.7 scholarly>=1.2.0, <1.5 +markdown2==2.4.2 +html2text==2020.1.16 +python-dateutil==2.8.2 # other natsort>=2.2.0,<8.1.0 diff --git a/requirements.txt b/requirements.txt index d09c2157..1db961fe 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,6 +14,3 @@ Wand>=0.4.4,<0.7.0 unidecode>=0.04.19,<1.3.0 lxml>=3.8.0,<4.7.0 flask-wtf>=0.14.2,<1.1.0 -markdown2==2.4.2 -html2text==2020.1.16 -python-dateutil==2.8.2