From a6af5b12d23650bb8942a562560f3ab8b535271e Mon Sep 17 00:00:00 2001
From: ibu
Date: Mon, 29 Nov 2021 09:16:31 +0000
Subject: [PATCH] Put under version control

---
 .gitignore | 51 +
 .pre-commit-config.yaml | 30 +
 Pipfile | 46 +
 Pipfile.lock | 1561 +++
 README.md | 13 +
 doc/Makefile | 20 +
 doc/source/conf.py | 71 +
 .../initial_data/seed_urls.list | 23 +
 doc/source/config_template/main.yaml | 88 +
 .../config_template/plugins/__init__.py | 0
 .../plugins/filter_resource_path.py | 22 +
 .../config_template/plugins/filter_site.py | 47 +
 .../plugins/filter_site_path.py | 24 +
 doc/source/devel/devel.md | 63 +
 doc/source/devel/related_work.md | 64 +
 doc/source/devel/todo.md | 77 +
 doc/source/development.rst | 9 +
 doc/source/elasticsearch.md | 119 +
 doc/source/index.rst | 37 +
 doc/source/installation.md | 122 +
 doc/source/introduction.md | 66 +
 doc/source/maintenance.md | 23 +
 doc/source/tensorflow_model_server.md | 98 +
 license.txt | 48 +
 pyproject.toml | 10 +
 src/atextcrawler/__init__.py | 0
 src/atextcrawler/__main__.py | 12 +
 src/atextcrawler/application.py | 204 +
 src/atextcrawler/assets/iana_langs | 7 +
 src/atextcrawler/assets/iso_639-1 | 219 +
 src/atextcrawler/assets/top_1e4 | 10000 ++++++++++++++++
 src/atextcrawler/config.py | 337 +
 src/atextcrawler/crawl.py | 215 +
 src/atextcrawler/db.py | 162 +
 src/atextcrawler/migrations/1.sql | 297 +
 src/atextcrawler/models.py | 610 +
 src/atextcrawler/plugin_defaults/__init__.py | 0
 .../plugin_defaults/filter_resource_path.py | 22 +
 .../plugin_defaults/filter_site.py | 47 +
 .../plugin_defaults/filter_site_path.py | 24 +
 src/atextcrawler/resource/__init__.py | 10 +
 src/atextcrawler/resource/__main__.py | 96 +
 src/atextcrawler/resource/dedup.py | 59 +
 src/atextcrawler/resource/document.py | 131 +
 src/atextcrawler/resource/feed.py | 155 +
 src/atextcrawler/resource/fetch.py | 327 +
 src/atextcrawler/resource/operations.py | 347 +
 src/atextcrawler/resource/page.py | 355 +
 src/atextcrawler/resource/plaintext.py | 148 +
 src/atextcrawler/resource/sitemap.py | 149 +
 src/atextcrawler/search/__init__.py | 6 +
 src/atextcrawler/search/engine.py | 270 +
 src/atextcrawler/site/__init__.py | 9 +
 src/atextcrawler/site/__main__.py | 68 +
 src/atextcrawler/site/feeds.py | 100 +
 src/atextcrawler/site/operations.py | 267 +
 src/atextcrawler/site/parse.py | 255 +
 src/atextcrawler/site/queue.py | 127 +
 src/atextcrawler/site/robots.py | 98 +
 src/atextcrawler/site/seed.py | 72 +
 src/atextcrawler/tensorflow.py | 69 +
 src/atextcrawler/utils/__init__.py | 0
 src/atextcrawler/utils/annotation.py | 481 +
 src/atextcrawler/utils/date_finder.py | 90 +
 src/atextcrawler/utils/durl.py | 278 +
 src/atextcrawler/utils/html.py | 136 +
 src/atextcrawler/utils/http.py | 58 +
 src/atextcrawler/utils/json.py | 32 +
 src/atextcrawler/utils/lang.py | 44 +
 src/atextcrawler/utils/link.py | 116 +
 src/atextcrawler/utils/muse.py | 120 +
 src/atextcrawler/utils/probe.py | 22 +
 src/atextcrawler/utils/section.py | 74 +
 src/atextcrawler/utils/similarity.py | 92 +
 src/atextcrawler/utils/tag.py | 189 +
 tests/__init__.py | 7 +
 tests/annotation.py | 49 +
 tests/date_finder.py | 20 +
 tests/durl.py | 68 +
 tests/page.py | 24 +
 tests/section.py | 105 +
 tests/simhash.py | 54 +
 tests/text.py | 65 +
 83 files changed, 20130 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 .pre-commit-config.yaml
 create mode 100644 Pipfile
 create mode 100644 Pipfile.lock
 create mode 100644 README.md
 create mode 100644 doc/Makefile
 create mode 100644 doc/source/conf.py
 create mode 100644 doc/source/config_template/initial_data/seed_urls.list
 create mode 100644 doc/source/config_template/main.yaml
 create mode 100644 doc/source/config_template/plugins/__init__.py
 create mode 100644 doc/source/config_template/plugins/filter_resource_path.py
 create mode 100644 doc/source/config_template/plugins/filter_site.py
 create mode 100644 doc/source/config_template/plugins/filter_site_path.py
 create mode 100644 doc/source/devel/devel.md
 create mode 100644 doc/source/devel/related_work.md
 create mode 100644 doc/source/devel/todo.md
 create mode 100644 doc/source/development.rst
 create mode 100644 doc/source/elasticsearch.md
 create mode 100644 doc/source/index.rst
 create mode 100644 doc/source/installation.md
 create mode 100644 doc/source/introduction.md
 create mode 100644 doc/source/maintenance.md
 create mode 100644 doc/source/tensorflow_model_server.md
 create mode 100644 license.txt
 create mode 100644 pyproject.toml
 create mode 100644 src/atextcrawler/__init__.py
 create mode 100644 src/atextcrawler/__main__.py
 create mode 100644 src/atextcrawler/application.py
 create mode 100644 src/atextcrawler/assets/iana_langs
 create mode 100644 src/atextcrawler/assets/iso_639-1
 create mode 100644 src/atextcrawler/assets/top_1e4
 create mode 100644 src/atextcrawler/config.py
 create mode 100644 src/atextcrawler/crawl.py
 create mode 100644 src/atextcrawler/db.py
 create mode 100644 src/atextcrawler/migrations/1.sql
 create mode 100644 src/atextcrawler/models.py
 create mode 100644 src/atextcrawler/plugin_defaults/__init__.py
 create mode 100644 src/atextcrawler/plugin_defaults/filter_resource_path.py
 create mode 100644 src/atextcrawler/plugin_defaults/filter_site.py
 create mode 100644 src/atextcrawler/plugin_defaults/filter_site_path.py
 create mode 100644 src/atextcrawler/resource/__init__.py
 create mode 100644 src/atextcrawler/resource/__main__.py
 create mode 100644 src/atextcrawler/resource/dedup.py
 create mode 100644 src/atextcrawler/resource/document.py
 create mode 100644 src/atextcrawler/resource/feed.py
 create mode 100644 src/atextcrawler/resource/fetch.py
 create mode 100644 src/atextcrawler/resource/operations.py
 create mode 100644 src/atextcrawler/resource/page.py
 create mode 100644 src/atextcrawler/resource/plaintext.py
 create mode 100644 src/atextcrawler/resource/sitemap.py
 create mode 100644 src/atextcrawler/search/__init__.py
 create mode 100644 src/atextcrawler/search/engine.py
 create mode 100644 src/atextcrawler/site/__init__.py
 create mode 100644 src/atextcrawler/site/__main__.py
 create mode 100644 src/atextcrawler/site/feeds.py
 create mode 100644 src/atextcrawler/site/operations.py
 create mode 100644 src/atextcrawler/site/parse.py
 create mode 100644 src/atextcrawler/site/queue.py
 create mode 100644 src/atextcrawler/site/robots.py
 create mode 100644 src/atextcrawler/site/seed.py
 create mode 100644 src/atextcrawler/tensorflow.py
 create mode 100644 src/atextcrawler/utils/__init__.py
 create mode 100644 src/atextcrawler/utils/annotation.py
 create mode 100644 src/atextcrawler/utils/date_finder.py
 create mode 100644 src/atextcrawler/utils/durl.py
 create mode 100644 src/atextcrawler/utils/html.py
 create mode 100644 src/atextcrawler/utils/http.py
 create mode 100644 src/atextcrawler/utils/json.py
 create mode 100644 src/atextcrawler/utils/lang.py
 create mode 100644 src/atextcrawler/utils/link.py
 create mode 100644 src/atextcrawler/utils/muse.py
 create mode 100644 src/atextcrawler/utils/probe.py
 create mode 100644 src/atextcrawler/utils/section.py
 create mode 100644 src/atextcrawler/utils/similarity.py
 create mode 100644 src/atextcrawler/utils/tag.py
 create mode 100644 tests/__init__.py
 create mode 100644 tests/annotation.py
 create mode 100644 tests/date_finder.py
 create mode 100644 tests/durl.py
 create mode 100644 tests/page.py
 create mode 100644 tests/section.py
 create mode 100644 tests/simhash.py
 create mode 100644 tests/text.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..b9d2de6
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,51 @@
+# Backup files
+*.~
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+
+# C extensions
+*.so
+
+# Distribution / packaging
+bin/
+build/
+develop-eggs/
+dist/
+eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+NOTES
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+.tox/
+.coverage
+.cache
+nosetests.xml
+coverage.xml
+htmlcov
+
+# Translations
+*.mo
+
+# mypy cache
+.mypy_cache
+
+# Sphinx documentation
+doc/build/
+doc/source/reference/
+
+# tmp dir
+tmp/
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 0000000..adf1b1a
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,30 @@
+# See https://pre-commit.com for more information
+# See https://pre-commit.com/hooks.html for more hooks
+repos:
+- repo: https://github.com/pre-commit/pre-commit-hooks
+  rev: v4.0.1
+  hooks:
+  - id: trailing-whitespace
+  - id: end-of-file-fixer
+  - id: check-yaml
+  - id: check-added-large-files
+- repo: https://github.com/psf/black
+  rev: 21.11b1
+  hooks:
+  - id: black
+- repo: https://github.com/timothycrosley/isort
+  rev: 5.10.1
+  hooks:
+  - id: isort
+    args: ["--profile", "black", "--filter-files", "-l", "79"]
+- repo: https://github.com/myint/autoflake
+  rev: v1.4
+  hooks:
+  - id: autoflake
+    args:
+      [
+        "--in-place",
+        "--remove-all-unused-imports",
+        "--ignore-init-module-imports",
+        "--remove-unused-variables",
+      ]
diff --git a/Pipfile b/Pipfile
new file mode 100644
index 0000000..eed7f14
--- /dev/null
+++ b/Pipfile
@@ -0,0 +1,46 @@
+[[source]]
+url = "https://pypi.org/simple"
+verify_ssl = true
+name = "pypi"
+
+[packages]
+aiohttp = "*"
+async-lru = "*"
+asyncpg = "*"
+beautifulsoup4 = "*"
+elasticsearch = { version = ">=7.0.0,<8.0.0", extras = ['async'] }
+elasticsearch-dsl = { version = ">=7.0.0,<8.0.0" }
+feedparser = "*"
+gcld3 = "*"
+# TODO: recheck
+pypandoc = "*"
+pytidylib = "*"
+pytz = "*"
+pyyaml = "*"
+tika = "*"
+tldextract = "*"
+voluptuous = "*"
+simhash = "*"
+async-dns = "*"
+types-pyyaml = "*"
+sphinx-rtd-theme = "*"
+
+[dev-packages]
+mypy = "*"
+pre-commit = "*"
+sphinx = "*"
+myst-parser = "*"
+isort = "*"
+blacken-docs = "*"
+pybetter = "*"
+interrogate = "*"
+autoflake = "*"
+types-pyyaml = "*"
+types-pytz = "*"
+black = "*"
+
+[requires]
+python_version = "3.9"
+
+[pipenv]
+allow_prereleases = true
diff --git a/Pipfile.lock b/Pipfile.lock
new file mode 100644
index 0000000..58e2e74
--- /dev/null
+++ b/Pipfile.lock
@@ -0,0 +1,1561 @@
+{
+    "_meta": {
+        "hash": {
+            "sha256": "df63c76f1b8b031337d671aade6cc91f9add2205a75dbbd2770fa14e9430be55"
+        },
+        "pipfile-spec": 6,
+        "requires": {
+            "python_version": "3.9"
+        },
+        "sources": [
+            {
+                "name": "pypi",
+                "url": "https://pypi.org/simple",
+                "verify_ssl": true
+            }
+        ]
+    },
+    "default": {
+        "aiohttp": {
+            "hashes": [
+                "sha256:01d7bdb774a9acc838e6b8f1d114f45303841b89b95984cbb7d80ea41172a9e3",
+                "sha256:03a6d5349c9ee8f79ab3ff3694d6ce1cfc3ced1c9d36200cb8f08ba06bd3b782",
"sha256:04d48b8ce6ab3cf2097b1855e1505181bdd05586ca275f2505514a6e274e8e75", + "sha256:0770e2806a30e744b4e21c9d73b7bee18a1cfa3c47991ee2e5a65b887c49d5cf", + "sha256:07b05cd3305e8a73112103c834e91cd27ce5b4bd07850c4b4dbd1877d3f45be7", + "sha256:086f92daf51a032d062ec5f58af5ca6a44d082c35299c96376a41cbb33034675", + "sha256:099ebd2c37ac74cce10a3527d2b49af80243e2a4fa39e7bce41617fbc35fa3c1", + "sha256:0c7ebbbde809ff4e970824b2b6cb7e4222be6b95a296e46c03cf050878fc1785", + "sha256:102e487eeb82afac440581e5d7f8f44560b36cf0bdd11abc51a46c1cd88914d4", + "sha256:11691cf4dc5b94236ccc609b70fec991234e7ef8d4c02dd0c9668d1e486f5abf", + "sha256:11a67c0d562e07067c4e86bffc1553f2cf5b664d6111c894671b2b8712f3aba5", + "sha256:12de6add4038df8f72fac606dff775791a60f113a725c960f2bab01d8b8e6b15", + "sha256:13487abd2f761d4be7c8ff9080de2671e53fff69711d46de703c310c4c9317ca", + "sha256:15b09b06dae900777833fe7fc4b4aa426556ce95847a3e8d7548e2d19e34edb8", + "sha256:1c182cb873bc91b411e184dab7a2b664d4fea2743df0e4d57402f7f3fa644bac", + "sha256:1ed0b6477896559f17b9eaeb6d38e07f7f9ffe40b9f0f9627ae8b9926ae260a8", + "sha256:28d490af82bc6b7ce53ff31337a18a10498303fe66f701ab65ef27e143c3b0ef", + "sha256:2e5d962cf7e1d426aa0e528a7e198658cdc8aa4fe87f781d039ad75dcd52c516", + "sha256:2ed076098b171573161eb146afcb9129b5ff63308960aeca4b676d9d3c35e700", + "sha256:2f2f69dca064926e79997f45b2f34e202b320fd3782f17a91941f7eb85502ee2", + "sha256:31560d268ff62143e92423ef183680b9829b1b482c011713ae941997921eebc8", + "sha256:31d1e1c0dbf19ebccbfd62eff461518dcb1e307b195e93bba60c965a4dcf1ba0", + "sha256:37951ad2f4a6df6506750a23f7cbabad24c73c65f23f72e95897bb2cecbae676", + "sha256:3af642b43ce56c24d063325dd2cf20ee012d2b9ba4c3c008755a301aaea720ad", + "sha256:44db35a9e15d6fe5c40d74952e803b1d96e964f683b5a78c3cc64eb177878155", + "sha256:473d93d4450880fe278696549f2e7aed8cd23708c3c1997981464475f32137db", + "sha256:477c3ea0ba410b2b56b7efb072c36fa91b1e6fc331761798fa3f28bb224830dd", + "sha256:4a4a4e30bf1edcad13fb0804300557aedd07a92cabc74382fdd0ba6ca2661091", + "sha256:4aed991a28ea3ce320dc8ce655875e1e00a11bdd29fe9444dd4f88c30d558602", + "sha256:51467000f3647d519272392f484126aa716f747859794ac9924a7aafa86cd411", + "sha256:55c3d1072704d27401c92339144d199d9de7b52627f724a949fc7d5fc56d8b93", + "sha256:589c72667a5febd36f1315aa6e5f56dd4aa4862df295cb51c769d16142ddd7cd", + "sha256:5bfde62d1d2641a1f5173b8c8c2d96ceb4854f54a44c23102e2ccc7e02f003ec", + "sha256:5c23b1ad869653bc818e972b7a3a79852d0e494e9ab7e1a701a3decc49c20d51", + "sha256:61bfc23df345d8c9716d03717c2ed5e27374e0fe6f659ea64edcd27b4b044cf7", + "sha256:6ae828d3a003f03ae31915c31fa684b9890ea44c9c989056fea96e3d12a9fa17", + "sha256:6c7cefb4b0640703eb1069835c02486669312bf2f12b48a748e0a7756d0de33d", + "sha256:6d69f36d445c45cda7b3b26afef2fc34ef5ac0cdc75584a87ef307ee3c8c6d00", + "sha256:6f0d5f33feb5f69ddd57a4a4bd3d56c719a141080b445cbf18f238973c5c9923", + "sha256:6f8b01295e26c68b3a1b90efb7a89029110d3a4139270b24fda961893216c440", + "sha256:713ac174a629d39b7c6a3aa757b337599798da4c1157114a314e4e391cd28e32", + "sha256:718626a174e7e467f0558954f94af117b7d4695d48eb980146016afa4b580b2e", + "sha256:7187a76598bdb895af0adbd2fb7474d7f6025d170bc0a1130242da817ce9e7d1", + "sha256:71927042ed6365a09a98a6377501af5c9f0a4d38083652bcd2281a06a5976724", + "sha256:7d08744e9bae2ca9c382581f7dce1273fe3c9bae94ff572c3626e8da5b193c6a", + "sha256:7dadf3c307b31e0e61689cbf9e06be7a867c563d5a63ce9dca578f956609abf8", + "sha256:81e3d8c34c623ca4e36c46524a3530e99c0bc95ed068fd6e9b55cb721d408fb2", + "sha256:844a9b460871ee0a0b0b68a64890dae9c415e513db0f4a7e3cab41a0f2fedf33", + 
"sha256:8b7ef7cbd4fec9a1e811a5de813311ed4f7ac7d93e0fda233c9b3e1428f7dd7b", + "sha256:97ef77eb6b044134c0b3a96e16abcb05ecce892965a2124c566af0fd60f717e2", + "sha256:99b5eeae8e019e7aad8af8bb314fb908dd2e028b3cdaad87ec05095394cce632", + "sha256:a25fa703a527158aaf10dafd956f7d42ac6d30ec80e9a70846253dd13e2f067b", + "sha256:a2f635ce61a89c5732537a7896b6319a8fcfa23ba09bec36e1b1ac0ab31270d2", + "sha256:a79004bb58748f31ae1cbe9fa891054baaa46fb106c2dc7af9f8e3304dc30316", + "sha256:a996d01ca39b8dfe77440f3cd600825d05841088fd6bc0144cc6c2ec14cc5f74", + "sha256:b0e20cddbd676ab8a64c774fefa0ad787cc506afd844de95da56060348021e96", + "sha256:b6613280ccedf24354406caf785db748bebbddcf31408b20c0b48cb86af76866", + "sha256:b9d00268fcb9f66fbcc7cd9fe423741d90c75ee029a1d15c09b22d23253c0a44", + "sha256:bb01ba6b0d3f6c68b89fce7305080145d4877ad3acaed424bae4d4ee75faa950", + "sha256:c2aef4703f1f2ddc6df17519885dbfa3514929149d3ff900b73f45998f2532fa", + "sha256:c34dc4958b232ef6188c4318cb7b2c2d80521c9a56c52449f8f93ab7bc2a8a1c", + "sha256:c3630c3ef435c0a7c549ba170a0633a56e92629aeed0e707fec832dee313fb7a", + "sha256:c3d6a4d0619e09dcd61021debf7059955c2004fa29f48788a3dfaf9c9901a7cd", + "sha256:d15367ce87c8e9e09b0f989bfd72dc641bcd04ba091c68cd305312d00962addd", + "sha256:d2f9b69293c33aaa53d923032fe227feac867f81682f002ce33ffae978f0a9a9", + "sha256:e999f2d0e12eea01caeecb17b653f3713d758f6dcc770417cf29ef08d3931421", + "sha256:ea302f34477fda3f85560a06d9ebdc7fa41e82420e892fc50b577e35fc6a50b2", + "sha256:eaba923151d9deea315be1f3e2b31cc39a6d1d2f682f942905951f4e40200922", + "sha256:ef9612483cb35171d51d9173647eed5d0069eaa2ee812793a75373447d487aa4", + "sha256:f5315a2eb0239185af1bddb1abf472d877fede3cc8d143c6cddad37678293237", + "sha256:fa0ffcace9b3aa34d205d8130f7873fcfefcb6a4dd3dd705b0dab69af6712642", + "sha256:fc5471e1a54de15ef71c1bc6ebe80d4dc681ea600e68bfd1cbce40427f0b7578" + ], + "index": "pypi", + "version": "==3.8.1" + }, + "aiosignal": { + "hashes": [ + "sha256:26e62109036cd181df6e6ad646f91f0dcfd05fe16d0cb924138ff2ab75d64e3a", + "sha256:78ed67db6c7b7ced4f98e495e572106d5c432a93e1ddd1bf475e1dc05f5b7df2" + ], + "markers": "python_version >= '3.6'", + "version": "==1.2.0" + }, + "alabaster": { + "hashes": [ + "sha256:446438bdcca0e05bd45ea2de1668c1d9b032e1a9154c2c259092d77031ddd359", + "sha256:a661d72d58e6ea8a57f7a86e37d86716863ee5e92788398526d58b26a4e4dc02" + ], + "version": "==0.7.12" + }, + "async-dns": { + "hashes": [ + "sha256:8536be11c3789b154472a86db9df5c2149d5466949c78071019bf5edccbb639e", + "sha256:a257e47cc64022f95d570a1cd7f5fe90c2d8546b24fbe1049c3980a9a5832b96" + ], + "index": "pypi", + "version": "==2.0.0" + }, + "async-lru": { + "hashes": [ + "sha256:baa898027619f5cc31b7966f96f00e4fc0df43ba206a8940a5d1af5336a477cb" + ], + "index": "pypi", + "version": "==1.0.2" + }, + "async-timeout": { + "hashes": [ + "sha256:a22c0b311af23337eb05fcf05a8b51c3ea53729d46fb5460af62bee033cec690", + "sha256:b930cb161a39042f9222f6efb7301399c87eeab394727ec5437924a36d6eef51" + ], + "markers": "python_version >= '3.6'", + "version": "==4.0.1" + }, + "asyncpg": { + "hashes": [ + "sha256:0a61fb196ce4dae2f2fa26eb20a778db21bbee484d2e798cb3cc988de13bdd1b", + "sha256:18d49e2d93a7139a2fdbd113e320cc47075049997268a61bfbe0dde680c55471", + "sha256:191fe6341385b7fdea7dbdcf47fd6db3fd198827dcc1f2b228476d13c05a03c6", + "sha256:1a70783f6ffa34cc7dd2de20a873181414a34fd35a4a208a1f1a7f9f695e4ec4", + "sha256:2633331cbc8429030b4f20f712f8d0fbba57fa8555ee9b2f45f981b81328b256", + "sha256:2bc197fc4aca2fd24f60241057998124012469d2e414aed3f992579db0c88e3a", + 
"sha256:4327f691b1bdb222df27841938b3e04c14068166b3a97491bec2cb982f49f03e", + "sha256:43cde84e996a3afe75f325a68300093425c2f47d340c0fc8912765cf24a1c095", + "sha256:52fab7f1b2c29e187dd8781fce896249500cf055b63471ad66332e537e9b5f7e", + "sha256:56d88d7ef4341412cd9c68efba323a4519c916979ba91b95d4c08799d2ff0c09", + "sha256:5e4105f57ad1e8fbc8b1e535d8fcefa6ce6c71081228f08680c6dea24384ff0e", + "sha256:63f8e6a69733b285497c2855464a34de657f2cccd25aeaeeb5071872e9382540", + "sha256:649e2966d98cc48d0646d9a4e29abecd8b59d38d55c256d5c857f6b27b7407ac", + "sha256:6f8f5fc975246eda83da8031a14004b9197f510c41511018e7b1bedde6968e92", + "sha256:72a1e12ea0cf7c1e02794b697e3ca967b2360eaa2ce5d4bfdd8604ec2d6b774b", + "sha256:739bbd7f89a2b2f6bc44cb8bf967dab12c5bc714fcbe96e68d512be45ecdf962", + "sha256:863d36eba4a7caa853fd7d83fad5fd5306f050cc2fe6e54fbe10cdb30420e5e9", + "sha256:a738f1b2876f30d710d3dc1e7858160a0afe1603ba16bf5f391f5316eb0ed855", + "sha256:a84d30e6f850bac0876990bcd207362778e2208df0bee8be8da9f1558255e634", + "sha256:acb311722352152936e58a8ee3c5b8e791b24e84cd7d777c414ff05b3530ca68", + "sha256:beaecc52ad39614f6ca2e48c3ca15d56e24a2c15cbfdcb764a4320cc45f02fd5", + "sha256:bf5e3408a14a17d480f36ebaf0401a12ff6ae5457fdf45e4e2775c51cc9517d3", + "sha256:bf6dc9b55b9113f39eaa2057337ce3f9ef7de99a053b8a16360395ce588925cd", + "sha256:ddb4c3263a8d63dcde3d2c4ac1c25206bfeb31fa83bd70fd539e10f87739dee4", + "sha256:f55918ded7b85723a5eaeb34e86e7b9280d4474be67df853ab5a7fa0cc7c6bf2", + "sha256:fe471ccd915b739ca65e2e4dbd92a11b44a5b37f2e38f70827a1c147dafe0fa8" + ], + "index": "pypi", + "version": "==0.25.0" + }, + "attrs": { + "hashes": [ + "sha256:149e90d6d8ac20db7a955ad60cf0e6881a3f20d37096140088356da6c716b0b1", + "sha256:ef6aaac3ca6cd92904cdd0d83f629a15f18053ec84e6432106f7a4d04ae4f5fb" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", + "version": "==21.2.0" + }, + "babel": { + "hashes": [ + "sha256:ab49e12b91d937cd11f0b67cb259a57ab4ad2b59ac7a3b41d6c06c0ac5b0def9", + "sha256:bc0c176f9f6a994582230df350aa6e05ba2ebe4b3ac317eab29d9be5d2768da0" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", + "version": "==2.9.1" + }, + "beautifulsoup4": { + "hashes": [ + "sha256:9a315ce70049920ea4572a4055bc4bd700c940521d36fc858205ad4fcde149bf", + "sha256:c23ad23c521d818955a4151a67d81580319d4bf548d3d49f4223ae041ff98891" + ], + "index": "pypi", + "version": "==4.10.0" + }, + "certifi": { + "hashes": [ + "sha256:78884e7c1d4b00ce3cea67b44566851c4343c120abd683433ce934a68ea58872", + "sha256:d62a0163eb4c2344ac042ab2bdf75399a71a2d8c7d47eac2e2ee91b9d6339569" + ], + "version": "==2021.10.8" + }, + "charset-normalizer": { + "hashes": [ + "sha256:735e240d9a8506778cd7a453d97e817e536bb1fc29f4f6961ce297b9c7a917b0", + "sha256:83fcdeb225499d6344c8f7f34684c2981270beacc32ede2e669e94f7fa544405" + ], + "markers": "python_version >= '3'", + "version": "==2.0.8" + }, + "docutils": { + "hashes": [ + "sha256:686577d2e4c32380bb50cbb22f575ed742d58168cee37e99117a854bcd88f125", + "sha256:cf316c8370a737a022b72b56874f6602acf974a37a9fba42ec2876387549fc61" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", + "version": "==0.17.1" + }, + "elasticsearch": { + "extras": [ + "async" + ], + "hashes": [ + "sha256:436f871848a5020bf9b47495812b229b59bd0c5d7e40adbd5e3c89896b311704", + "sha256:83c299a08fc8737c72454e6d3b2a01ba1b194e4f4d9e4f8bae7058cec326f39f" + ], + "index": "pypi", + "version": "==7.15.2" + }, + "elasticsearch-dsl": { + "hashes": [ + 
"sha256:046ea10820b94c075081b528b4526c5bc776bda4226d702f269a5f203232064b", + "sha256:c4a7b93882918a413b63bed54018a1685d7410ffd8facbc860ee7fd57f214a6d" + ], + "index": "pypi", + "version": "==7.4.0" + }, + "feedparser": { + "hashes": [ + "sha256:1b7f57841d9cf85074deb316ed2c795091a238adb79846bc46dccdaf80f9c59a", + "sha256:5ce0410a05ab248c8c7cfca3a0ea2203968ee9ff4486067379af4827a59f9661" + ], + "index": "pypi", + "version": "==6.0.8" + }, + "filelock": { + "hashes": [ + "sha256:2e139a228bcf56dd8b2274a65174d005c4a6b68540ee0bdbb92c76f43f29f7e8", + "sha256:93d512b32a23baf4cac44ffd72ccf70732aeff7b8050fcaf6d3ec406d954baf4" + ], + "markers": "python_version >= '3.6'", + "version": "==3.4.0" + }, + "frozenlist": { + "hashes": [ + "sha256:01d79515ed5aa3d699b05f6bdcf1fe9087d61d6b53882aa599a10853f0479c6c", + "sha256:0a7c7cce70e41bc13d7d50f0e5dd175f14a4f1837a8549b0936ed0cbe6170bf9", + "sha256:11ff401951b5ac8c0701a804f503d72c048173208490c54ebb8d7bb7c07a6d00", + "sha256:14a5cef795ae3e28fb504b73e797c1800e9249f950e1c964bb6bdc8d77871161", + "sha256:16eef427c51cb1203a7c0ab59d1b8abccaba9a4f58c4bfca6ed278fc896dc193", + "sha256:16ef7dd5b7d17495404a2e7a49bac1bc13d6d20c16d11f4133c757dd94c4144c", + "sha256:181754275d5d32487431a0a29add4f897968b7157204bc1eaaf0a0ce80c5ba7d", + "sha256:1cf63243bc5f5c19762943b0aa9e0d3fb3723d0c514d820a18a9b9a5ef864315", + "sha256:1cfe6fef507f8bac40f009c85c7eddfed88c1c0d38c75e72fe10476cef94e10f", + "sha256:1fef737fd1388f9b93bba8808c5f63058113c10f4e3c0763ced68431773f72f9", + "sha256:25b358aaa7dba5891b05968dd539f5856d69f522b6de0bf34e61f133e077c1a4", + "sha256:26f602e380a5132880fa245c92030abb0fc6ff34e0c5500600366cedc6adb06a", + "sha256:28e164722ea0df0cf6d48c4d5bdf3d19e87aaa6dfb39b0ba91153f224b912020", + "sha256:2de5b931701257d50771a032bba4e448ff958076380b049fd36ed8738fdb375b", + "sha256:3457f8cf86deb6ce1ba67e120f1b0128fcba1332a180722756597253c465fc1d", + "sha256:351686ca020d1bcd238596b1fa5c8efcbc21bffda9d0efe237aaa60348421e2a", + "sha256:406aeb340613b4b559db78d86864485f68919b7141dec82aba24d1477fd2976f", + "sha256:41de4db9b9501679cf7cddc16d07ac0f10ef7eb58c525a1c8cbff43022bddca4", + "sha256:41f62468af1bd4e4b42b5508a3fe8cc46a693f0cdd0ca2f443f51f207893d837", + "sha256:4766632cd8a68e4f10f156a12c9acd7b1609941525569dd3636d859d79279ed3", + "sha256:47b2848e464883d0bbdcd9493c67443e5e695a84694efff0476f9059b4cb6257", + "sha256:4a495c3d513573b0b3f935bfa887a85d9ae09f0627cf47cad17d0cc9b9ba5c38", + "sha256:4ad065b2ebd09f32511ff2be35c5dfafee6192978b5a1e9d279a5c6e121e3b03", + "sha256:4c457220468d734e3077580a3642b7f682f5fd9507f17ddf1029452450912cdc", + "sha256:4f52d0732e56906f8ddea4bd856192984650282424049c956857fed43697ea43", + "sha256:54a1e09ab7a69f843cd28fefd2bcaf23edb9e3a8d7680032c8968b8ac934587d", + "sha256:5a72eecf37eface331636951249d878750db84034927c997d47f7f78a573b72b", + "sha256:5df31bb2b974f379d230a25943d9bf0d3bc666b4b0807394b131a28fca2b0e5f", + "sha256:66a518731a21a55b7d3e087b430f1956a36793acc15912e2878431c7aec54210", + "sha256:6790b8d96bbb74b7a6f4594b6f131bd23056c25f2aa5d816bd177d95245a30e3", + "sha256:68201be60ac56aff972dc18085800b6ee07973c49103a8aba669dee3d71079de", + "sha256:6e105013fa84623c057a4381dc8ea0361f4d682c11f3816cc80f49a1f3bc17c6", + "sha256:705c184b77565955a99dc360f359e8249580c6b7eaa4dc0227caa861ef46b27a", + "sha256:72cfbeab7a920ea9e74b19aa0afe3b4ad9c89471e3badc985d08756efa9b813b", + "sha256:735f386ec522e384f511614c01d2ef9cf799f051353876b4c6fb93ef67a6d1ee", + "sha256:82d22f6e6f2916e837c91c860140ef9947e31194c82aaeda843d6551cec92f19", + 
"sha256:83334e84a290a158c0c4cc4d22e8c7cfe0bba5b76d37f1c2509dabd22acafe15", + "sha256:84e97f59211b5b9083a2e7a45abf91cfb441369e8bb6d1f5287382c1c526def3", + "sha256:87521e32e18a2223311afc2492ef2d99946337da0779ddcda77b82ee7319df59", + "sha256:878ebe074839d649a1cdb03a61077d05760624f36d196884a5cafb12290e187b", + "sha256:89fdfc84c6bf0bff2ff3170bb34ecba8a6911b260d318d377171429c4be18c73", + "sha256:8b4c7665a17c3a5430edb663e4ad4e1ad457614d1b2f2b7f87052e2ef4fa45ca", + "sha256:8b54cdd2fda15467b9b0bfa78cee2ddf6dbb4585ef23a16e14926f4b076dfae4", + "sha256:94728f97ddf603d23c8c3dd5cae2644fa12d33116e69f49b1644a71bb77b89ae", + "sha256:954b154a4533ef28bd3e83ffdf4eadf39deeda9e38fb8feaf066d6069885e034", + "sha256:977a1438d0e0d96573fd679d291a1542097ea9f4918a8b6494b06610dfeefbf9", + "sha256:9ade70aea559ca98f4b1b1e5650c45678052e76a8ab2f76d90f2ac64180215a2", + "sha256:9b6e21e5770df2dea06cb7b6323fbc008b13c4a4e3b52cb54685276479ee7676", + "sha256:a0d3ffa8772464441b52489b985d46001e2853a3b082c655ec5fad9fb6a3d618", + "sha256:a37594ad6356e50073fe4f60aa4187b97d15329f2138124d252a5a19c8553ea4", + "sha256:a8d86547a5e98d9edd47c432f7a14b0c5592624b496ae9880fb6332f34af1edc", + "sha256:aa44c4740b4e23fcfa259e9dd52315d2b1770064cde9507457e4c4a65a04c397", + "sha256:acc4614e8d1feb9f46dd829a8e771b8f5c4b1051365d02efb27a3229048ade8a", + "sha256:af2a51c8a381d76eabb76f228f565ed4c3701441ecec101dd18be70ebd483cfd", + "sha256:b2ae2f5e9fa10805fb1c9adbfefaaecedd9e31849434be462c3960a0139ed729", + "sha256:b46f997d5ed6d222a863b02cdc9c299101ee27974d9bbb2fd1b3c8441311c408", + "sha256:bc93f5f62df3bdc1f677066327fc81f92b83644852a31c6aa9b32c2dde86ea7d", + "sha256:bfbaa08cf1452acad9cb1c1d7b89394a41e712f88df522cea1a0f296b57782a0", + "sha256:c1e8e9033d34c2c9e186e58279879d78c94dd365068a3607af33f2bc99357a53", + "sha256:c5328ed53fdb0a73c8a50105306a3bc013e5ca36cca714ec4f7bd31d38d8a97f", + "sha256:c6a9d84ee6427b65a81fc24e6ef589cb794009f5ca4150151251c062773e7ed2", + "sha256:c98d3c04701773ad60d9545cd96df94d955329efc7743fdb96422c4b669c633b", + "sha256:cb3957c39668d10e2b486acc85f94153520a23263b6401e8f59422ef65b9520d", + "sha256:e63ad0beef6ece06475d29f47d1f2f29727805376e09850ebf64f90777962792", + "sha256:e74f8b4d8677ebb4015ac01fcaf05f34e8a1f22775db1f304f497f2f88fdc697", + "sha256:e7d0dd3e727c70c2680f5f09a0775525229809f1a35d8552b92ff10b2b14f2c2", + "sha256:ec6cf345771cdb00791d271af9a0a6fbfc2b6dd44cb753f1eeaa256e21622adb", + "sha256:ed58803563a8c87cf4c0771366cf0ad1aa265b6b0ae54cbbb53013480c7ad74d", + "sha256:f0081a623c886197ff8de9e635528fd7e6a387dccef432149e25c13946cb0cd0", + "sha256:f025f1d6825725b09c0038775acab9ae94264453a696cc797ce20c0769a7b367", + "sha256:f5f3b2942c3b8b9bfe76b408bbaba3d3bb305ee3693e8b1d631fe0a0d4f93673", + "sha256:fbd4844ff111449f3bbe20ba24fbb906b5b1c2384d0f3287c9f7da2354ce6d23" + ], + "markers": "python_version >= '3.6'", + "version": "==1.2.0" + }, + "gcld3": { + "hashes": [ + "sha256:11a127e493c2952a83a957e2f025d6ff8b1d2efd353baa25bd99d7ceb3c96c54", + "sha256:47c8c779bfe7372a38564b0cd357556dc362aec81cb55b0c889059e8b952e959", + "sha256:4fc4ae1c8c7baab21a46fc66074787d010ca338f5c4b2ff80dd2448d18cc3d89", + "sha256:51538b26dd1741f49ceb03421710d4355c6ce3fd257a2ddebe868e36458f5d45", + "sha256:b56a9852861c7209434917885b8331ddf5f11d3d5810baafb9a29ffd5ccd35d1", + "sha256:c6bc0ecf3d95943a2b9fe61214aab8284a3b33f25e07c6caad064ea59417620f", + "sha256:fb745958278bfcc230fe3e15af3babb2752e77afd929ae0f95b9ef52648f3271" + ], + "index": "pypi", + "version": "==3.0.13" + }, + "idna": { + "hashes": [ + 
"sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff", + "sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d" + ], + "markers": "python_version >= '3'", + "version": "==3.3" + }, + "imagesize": { + "hashes": [ + "sha256:1db2f82529e53c3e929e8926a1fa9235aa82d0bd0c580359c67ec31b2fddaa8c", + "sha256:cd1750d452385ca327479d45b64d9c7729ecf0b3969a58148298c77092261f9d" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", + "version": "==1.3.0" + }, + "jinja2": { + "hashes": [ + "sha256:077ce6014f7b40d03b47d1f1ca4b0fc8328a692bd284016f806ed0eaca390ad8", + "sha256:611bb273cd68f3b993fabdc4064fc858c5b47a973cb5aa7999ec1ba405c87cd7" + ], + "markers": "python_version >= '3.6'", + "version": "==3.0.3" + }, + "markupsafe": { + "hashes": [ + "sha256:01a9b8ea66f1658938f65b93a85ebe8bc016e6769611be228d797c9d998dd298", + "sha256:023cb26ec21ece8dc3907c0e8320058b2e0cb3c55cf9564da612bc325bed5e64", + "sha256:0446679737af14f45767963a1a9ef7620189912317d095f2d9ffa183a4d25d2b", + "sha256:04635854b943835a6ea959e948d19dcd311762c5c0c6e1f0e16ee57022669194", + "sha256:0717a7390a68be14b8c793ba258e075c6f4ca819f15edfc2a3a027c823718567", + "sha256:0955295dd5eec6cb6cc2fe1698f4c6d84af2e92de33fbcac4111913cd100a6ff", + "sha256:0d4b31cc67ab36e3392bbf3862cfbadac3db12bdd8b02a2731f509ed5b829724", + "sha256:10f82115e21dc0dfec9ab5c0223652f7197feb168c940f3ef61563fc2d6beb74", + "sha256:168cd0a3642de83558a5153c8bd34f175a9a6e7f6dc6384b9655d2697312a646", + "sha256:1d609f577dc6e1aa17d746f8bd3c31aa4d258f4070d61b2aa5c4166c1539de35", + "sha256:1f2ade76b9903f39aa442b4aadd2177decb66525062db244b35d71d0ee8599b6", + "sha256:20dca64a3ef2d6e4d5d615a3fd418ad3bde77a47ec8a23d984a12b5b4c74491a", + "sha256:2a7d351cbd8cfeb19ca00de495e224dea7e7d919659c2841bbb7f420ad03e2d6", + "sha256:2d7d807855b419fc2ed3e631034685db6079889a1f01d5d9dac950f764da3dad", + "sha256:2ef54abee730b502252bcdf31b10dacb0a416229b72c18b19e24a4509f273d26", + "sha256:36bc903cbb393720fad60fc28c10de6acf10dc6cc883f3e24ee4012371399a38", + "sha256:37205cac2a79194e3750b0af2a5720d95f786a55ce7df90c3af697bfa100eaac", + "sha256:3c112550557578c26af18a1ccc9e090bfe03832ae994343cfdacd287db6a6ae7", + "sha256:3dd007d54ee88b46be476e293f48c85048603f5f516008bee124ddd891398ed6", + "sha256:4296f2b1ce8c86a6aea78613c34bb1a672ea0e3de9c6ba08a960efe0b0a09047", + "sha256:47ab1e7b91c098ab893b828deafa1203de86d0bc6ab587b160f78fe6c4011f75", + "sha256:49e3ceeabbfb9d66c3aef5af3a60cc43b85c33df25ce03d0031a608b0a8b2e3f", + "sha256:4dc8f9fb58f7364b63fd9f85013b780ef83c11857ae79f2feda41e270468dd9b", + "sha256:4efca8f86c54b22348a5467704e3fec767b2db12fc39c6d963168ab1d3fc9135", + "sha256:53edb4da6925ad13c07b6d26c2a852bd81e364f95301c66e930ab2aef5b5ddd8", + "sha256:5855f8438a7d1d458206a2466bf82b0f104a3724bf96a1c781ab731e4201731a", + "sha256:594c67807fb16238b30c44bdf74f36c02cdf22d1c8cda91ef8a0ed8dabf5620a", + "sha256:5b6d930f030f8ed98e3e6c98ffa0652bdb82601e7a016ec2ab5d7ff23baa78d1", + "sha256:5bb28c636d87e840583ee3adeb78172efc47c8b26127267f54a9c0ec251d41a9", + "sha256:60bf42e36abfaf9aff1f50f52644b336d4f0a3fd6d8a60ca0d054ac9f713a864", + "sha256:611d1ad9a4288cf3e3c16014564df047fe08410e628f89805e475368bd304914", + "sha256:6300b8454aa6930a24b9618fbb54b5a68135092bc666f7b06901f897fa5c2fee", + "sha256:63f3268ba69ace99cab4e3e3b5840b03340efed0948ab8f78d2fd87ee5442a4f", + "sha256:6557b31b5e2c9ddf0de32a691f2312a32f77cd7681d8af66c2692efdbef84c18", + "sha256:693ce3f9e70a6cf7d2fb9e6c9d8b204b6b39897a2c4a1aa65728d5ac97dcc1d8", + 
"sha256:6a7fae0dd14cf60ad5ff42baa2e95727c3d81ded453457771d02b7d2b3f9c0c2", + "sha256:6c4ca60fa24e85fe25b912b01e62cb969d69a23a5d5867682dd3e80b5b02581d", + "sha256:6fcf051089389abe060c9cd7caa212c707e58153afa2c649f00346ce6d260f1b", + "sha256:7d91275b0245b1da4d4cfa07e0faedd5b0812efc15b702576d103293e252af1b", + "sha256:89c687013cb1cd489a0f0ac24febe8c7a666e6e221b783e53ac50ebf68e45d86", + "sha256:8d206346619592c6200148b01a2142798c989edcb9c896f9ac9722a99d4e77e6", + "sha256:905fec760bd2fa1388bb5b489ee8ee5f7291d692638ea5f67982d968366bef9f", + "sha256:97383d78eb34da7e1fa37dd273c20ad4320929af65d156e35a5e2d89566d9dfb", + "sha256:984d76483eb32f1bcb536dc27e4ad56bba4baa70be32fa87152832cdd9db0833", + "sha256:99df47edb6bda1249d3e80fdabb1dab8c08ef3975f69aed437cb69d0a5de1e28", + "sha256:9f02365d4e99430a12647f09b6cc8bab61a6564363f313126f775eb4f6ef798e", + "sha256:a30e67a65b53ea0a5e62fe23682cfe22712e01f453b95233b25502f7c61cb415", + "sha256:ab3ef638ace319fa26553db0624c4699e31a28bb2a835c5faca8f8acf6a5a902", + "sha256:aca6377c0cb8a8253e493c6b451565ac77e98c2951c45f913e0b52facdcff83f", + "sha256:add36cb2dbb8b736611303cd3bfcee00afd96471b09cda130da3581cbdc56a6d", + "sha256:b2f4bf27480f5e5e8ce285a8c8fd176c0b03e93dcc6646477d4630e83440c6a9", + "sha256:b7f2d075102dc8c794cbde1947378051c4e5180d52d276987b8d28a3bd58c17d", + "sha256:baa1a4e8f868845af802979fcdbf0bb11f94f1cb7ced4c4b8a351bb60d108145", + "sha256:be98f628055368795d818ebf93da628541e10b75b41c559fdf36d104c5787066", + "sha256:bf5d821ffabf0ef3533c39c518f3357b171a1651c1ff6827325e4489b0e46c3c", + "sha256:c47adbc92fc1bb2b3274c4b3a43ae0e4573d9fbff4f54cd484555edbf030baf1", + "sha256:cdfba22ea2f0029c9261a4bd07e830a8da012291fbe44dc794e488b6c9bb353a", + "sha256:d6c7ebd4e944c85e2c3421e612a7057a2f48d478d79e61800d81468a8d842207", + "sha256:d7f9850398e85aba693bb640262d3611788b1f29a79f0c93c565694658f4071f", + "sha256:d8446c54dc28c01e5a2dbac5a25f071f6653e6e40f3a8818e8b45d790fe6ef53", + "sha256:deb993cacb280823246a026e3b2d81c493c53de6acfd5e6bfe31ab3402bb37dd", + "sha256:e0f138900af21926a02425cf736db95be9f4af72ba1bb21453432a07f6082134", + "sha256:e9936f0b261d4df76ad22f8fee3ae83b60d7c3e871292cd42f40b81b70afae85", + "sha256:f0567c4dc99f264f49fe27da5f735f414c4e7e7dd850cfd8e69f0862d7c74ea9", + "sha256:f5653a225f31e113b152e56f154ccbe59eeb1c7487b39b9d9f9cdb58e6c79dc5", + "sha256:f826e31d18b516f653fe296d967d700fddad5901ae07c622bb3705955e1faa94", + "sha256:f8ba0e8349a38d3001fae7eadded3f6606f0da5d748ee53cc1dab1d6527b9509", + "sha256:f9081981fe268bd86831e5c75f7de206ef275defcb82bc70740ae6dc507aee51", + "sha256:fa130dd50c57d53368c9d59395cb5526eda596d3ffe36666cd81a44d56e48872" + ], + "markers": "python_version >= '3.6'", + "version": "==2.0.1" + }, + "multidict": { + "hashes": [ + "sha256:06560fbdcf22c9387100979e65b26fba0816c162b888cb65b845d3def7a54c9b", + "sha256:067150fad08e6f2dd91a650c7a49ba65085303fcc3decbd64a57dc13a2733031", + "sha256:0a2cbcfbea6dc776782a444db819c8b78afe4db597211298dd8b2222f73e9cd0", + "sha256:0dd1c93edb444b33ba2274b66f63def8a327d607c6c790772f448a53b6ea59ce", + "sha256:0fed465af2e0eb6357ba95795d003ac0bdb546305cc2366b1fc8f0ad67cc3fda", + "sha256:116347c63ba049c1ea56e157fa8aa6edaf5e92925c9b64f3da7769bdfa012858", + "sha256:1b4ac3ba7a97b35a5ccf34f41b5a8642a01d1e55454b699e5e8e7a99b5a3acf5", + "sha256:1c7976cd1c157fa7ba5456ae5d31ccdf1479680dc9b8d8aa28afabc370df42b8", + "sha256:246145bff76cc4b19310f0ad28bd0769b940c2a49fc601b86bfd150cbd72bb22", + "sha256:25cbd39a9029b409167aa0a20d8a17f502d43f2efebfe9e3ac019fe6796c59ac", + 
"sha256:28e6d883acd8674887d7edc896b91751dc2d8e87fbdca8359591a13872799e4e", + "sha256:2d1d55cdf706ddc62822d394d1df53573d32a7a07d4f099470d3cb9323b721b6", + "sha256:2e77282fd1d677c313ffcaddfec236bf23f273c4fba7cdf198108f5940ae10f5", + "sha256:32fdba7333eb2351fee2596b756d730d62b5827d5e1ab2f84e6cbb287cc67fe0", + "sha256:35591729668a303a02b06e8dba0eb8140c4a1bfd4c4b3209a436a02a5ac1de11", + "sha256:380b868f55f63d048a25931a1632818f90e4be71d2081c2338fcf656d299949a", + "sha256:3822c5894c72e3b35aae9909bef66ec83e44522faf767c0ad39e0e2de11d3b55", + "sha256:38ba256ee9b310da6a1a0f013ef4e422fca30a685bcbec86a969bd520504e341", + "sha256:3bc3b1621b979621cee9f7b09f024ec76ec03cc365e638126a056317470bde1b", + "sha256:3d2d7d1fff8e09d99354c04c3fd5b560fb04639fd45926b34e27cfdec678a704", + "sha256:517d75522b7b18a3385726b54a081afd425d4f41144a5399e5abd97ccafdf36b", + "sha256:5f79c19c6420962eb17c7e48878a03053b7ccd7b69f389d5831c0a4a7f1ac0a1", + "sha256:5f841c4f14331fd1e36cbf3336ed7be2cb2a8f110ce40ea253e5573387db7621", + "sha256:637c1896497ff19e1ee27c1c2c2ddaa9f2d134bbb5e0c52254361ea20486418d", + "sha256:6ee908c070020d682e9b42c8f621e8bb10c767d04416e2ebe44e37d0f44d9ad5", + "sha256:77f0fb7200cc7dedda7a60912f2059086e29ff67cefbc58d2506638c1a9132d7", + "sha256:7878b61c867fb2df7a95e44b316f88d5a3742390c99dfba6c557a21b30180cac", + "sha256:78c106b2b506b4d895ddc801ff509f941119394b89c9115580014127414e6c2d", + "sha256:8b911d74acdc1fe2941e59b4f1a278a330e9c34c6c8ca1ee21264c51ec9b67ef", + "sha256:93de39267c4c676c9ebb2057e98a8138bade0d806aad4d864322eee0803140a0", + "sha256:9416cf11bcd73c861267e88aea71e9fcc35302b3943e45e1dbb4317f91a4b34f", + "sha256:94b117e27efd8e08b4046c57461d5a114d26b40824995a2eb58372b94f9fca02", + "sha256:9815765f9dcda04921ba467957be543423e5ec6a1136135d84f2ae092c50d87b", + "sha256:98ec9aea6223adf46999f22e2c0ab6cf33f5914be604a404f658386a8f1fba37", + "sha256:a37e9a68349f6abe24130846e2f1d2e38f7ddab30b81b754e5a1fde32f782b23", + "sha256:a43616aec0f0d53c411582c451f5d3e1123a68cc7b3475d6f7d97a626f8ff90d", + "sha256:a4771d0d0ac9d9fe9e24e33bed482a13dfc1256d008d101485fe460359476065", + "sha256:a5635bcf1b75f0f6ef3c8a1ad07b500104a971e38d3683167b9454cb6465ac86", + "sha256:a9acb76d5f3dd9421874923da2ed1e76041cb51b9337fd7f507edde1d86535d6", + "sha256:ac42181292099d91217a82e3fa3ce0e0ddf3a74fd891b7c2b347a7f5aa0edded", + "sha256:b227345e4186809d31f22087d0265655114af7cda442ecaf72246275865bebe4", + "sha256:b61f85101ef08cbbc37846ac0e43f027f7844f3fade9b7f6dd087178caedeee7", + "sha256:b70913cbf2e14275013be98a06ef4b412329fe7b4f83d64eb70dce8269ed1e1a", + "sha256:b9aad49466b8d828b96b9e3630006234879c8d3e2b0a9d99219b3121bc5cdb17", + "sha256:baf1856fab8212bf35230c019cde7c641887e3fc08cadd39d32a421a30151ea3", + "sha256:bd6c9c50bf2ad3f0448edaa1a3b55b2e6866ef8feca5d8dbec10ec7c94371d21", + "sha256:c1ff762e2ee126e6f1258650ac641e2b8e1f3d927a925aafcfde943b77a36d24", + "sha256:c30ac9f562106cd9e8071c23949a067b10211917fdcb75b4718cf5775356a940", + "sha256:c9631c642e08b9fff1c6255487e62971d8b8e821808ddd013d8ac058087591ac", + "sha256:cdd68778f96216596218b4e8882944d24a634d984ee1a5a049b300377878fa7c", + "sha256:ce8cacda0b679ebc25624d5de66c705bc53dcc7c6f02a7fb0f3ca5e227d80422", + "sha256:cfde464ca4af42a629648c0b0d79b8f295cf5b695412451716531d6916461628", + "sha256:d3def943bfd5f1c47d51fd324df1e806d8da1f8e105cc7f1c76a1daf0f7e17b0", + "sha256:d9b668c065968c5979fe6b6fa6760bb6ab9aeb94b75b73c0a9c1acf6393ac3bf", + "sha256:da7d57ea65744d249427793c042094c4016789eb2562576fb831870f9c878d9e", + "sha256:dc3a866cf6c13d59a01878cd806f219340f3e82eed514485e094321f24900677", + 
"sha256:df23c83398715b26ab09574217ca21e14694917a0c857e356fd39e1c64f8283f", + "sha256:dfc924a7e946dd3c6360e50e8f750d51e3ef5395c95dc054bc9eab0f70df4f9c", + "sha256:e4a67f1080123de76e4e97a18d10350df6a7182e243312426d508712e99988d4", + "sha256:e5283c0a00f48e8cafcecadebfa0ed1dac8b39e295c7248c44c665c16dc1138b", + "sha256:e58a9b5cc96e014ddf93c2227cbdeca94b56a7eb77300205d6e4001805391747", + "sha256:e6453f3cbeb78440747096f239d282cc57a2997a16b5197c9bc839099e1633d0", + "sha256:e6c4fa1ec16e01e292315ba76eb1d012c025b99d22896bd14a66628b245e3e01", + "sha256:e7d81ce5744757d2f05fc41896e3b2ae0458464b14b5a2c1e87a6a9d69aefaa8", + "sha256:ea21d4d5104b4f840b91d9dc8cbc832aba9612121eaba503e54eaab1ad140eb9", + "sha256:ecc99bce8ee42dcad15848c7885197d26841cb24fa2ee6e89d23b8993c871c64", + "sha256:f0bb0973f42ffcb5e3537548e0767079420aefd94ba990b61cf7bb8d47f4916d", + "sha256:f19001e790013ed580abfde2a4465388950728861b52f0da73e8e8a9418533c0", + "sha256:f76440e480c3b2ca7f843ff8a48dc82446b86ed4930552d736c0bac507498a52", + "sha256:f9bef5cff994ca3026fcc90680e326d1a19df9841c5e3d224076407cc21471a1", + "sha256:fc66d4016f6e50ed36fb39cd287a3878ffcebfa90008535c62e0e90a7ab713ae", + "sha256:fd77c8f3cba815aa69cb97ee2b2ef385c7c12ada9c734b0f3b32e26bb88bbf1d" + ], + "markers": "python_version >= '3.6'", + "version": "==5.2.0" + }, + "numpy": { + "hashes": [ + "sha256:011e4c430f2e2739e0d182cb7e2b5d47adc46a8db49a788e5798805b7878c4ba", + "sha256:013fa3500a6e5b3ba51401056aa9c41d83a7e737959d15f288d410f26cc33896", + "sha256:0ebb646ef72a2348036ed1692e6bb3f3dd4f8d026681b7168a9ac988d9832c27", + "sha256:21613822dd597d4645c586ac21910fded5344f843410dace91c129a38c31d8be", + "sha256:2242fa31413e40847016234485f228fa5e082b0c555d3db65fe9aa4efcfb8d8d", + "sha256:2934fb435d85341efb40f9db637a203a042300afdaa49f833608df21a5d8ae30", + "sha256:56109e7e9b205439990e90682163d8155cf5743efe65c30221ef3834621ffd3f", + "sha256:5e56515f5abb493bd32d2196ecd3ce794792419adfb7d8b4cccd4ddaf74ab924", + "sha256:6730a1495f1acedd97e82e32cca4d8dbe07b89f01f395ca02ca4a9e110d9519d", + "sha256:6759e6dafd96454be2d6dd80674293322191639400832688cd234c5f483ce1a9", + "sha256:7dbfa0abe053afbcb9e61ec1557556e4e30c3e4b5df4ec7849bf245e8c09feec", + "sha256:8c5016694b9bda77cda32ebfdde34d2246978ed4c49e9baab26bcf38621b7390", + "sha256:91bb1e29d74a90861e878b0c7bc941a1c0ac051cb4b171dc242e66953c95ca1e", + "sha256:a2dd58beb8a8266d704a76692e8eb76ff20f5b2940db7aeee216c2dbf226e5c6", + "sha256:b00d9bf43cc8975cf5e0c211d218e75a3f5ce1ae34dc84d8a489c28a0dba7848", + "sha256:b0ed56b9d7535d654d2a0478333cc08d1b9849767eafd07e1f6a3d8d90a2cad0", + "sha256:bc991b3f8ea7c0f6703df2bc23c098cfe6f1a3a5e8a3a901eb6a5619275d53ff", + "sha256:ccf027e3bbcd06b5c26a0196ddfc24c4d09d2001cc5d38738efff9d9ac8dee58", + "sha256:d0be0eb7df39f0e0732d73250de55e1dcc8086c23db970d5eab85dbf0713502d", + "sha256:e48368972e0999af098e0a6e9a3573895fd4c3b0b2d8c5cf215b17910cd6c124", + "sha256:e981667470ae74f06cfd0d54c5fa9cd88661a27eccaac2cba505039f0b29dc2e", + "sha256:eb6dd744a9f94b424bf70d62b7874798ea95b6b58fb63ec651b69a46872e5bd5" + ], + "markers": "python_version >= '3.8'", + "version": "==1.22.0rc1" + }, + "packaging": { + "hashes": [ + "sha256:dd47c42927d89ab911e606518907cc2d3a1f38bbd026385970643f9c5b8ecfeb", + "sha256:ef103e05f519cdc783ae24ea4e2e0f508a9c99b2d4969652eed6a2e1ea5bd522" + ], + "markers": "python_version >= '3.6'", + "version": "==21.3" + }, + "pygments": { + "hashes": [ + "sha256:b8e67fe6af78f492b3c4b3e2970c0624cbf08beb1e493b2c99b9fa1b67a20380", + "sha256:f398865f7eb6874156579fdf36bc840a03cab64d1cde9e93d68f46a425ec52c6" + ], + 
"markers": "python_version >= '3.5'", + "version": "==2.10.0" + }, + "pypandoc": { + "hashes": [ + "sha256:080903342d8cca6d953835c103b0f280a6cb66a6a20102692143a138b046c44f", + "sha256:6ea03c8e92d561b0b47ff91ee9a777c50a8d3a325f3272ea4fefef4bb6562b91" + ], + "index": "pypi", + "version": "==1.6.4" + }, + "pyparsing": { + "hashes": [ + "sha256:04ff808a5b90911829c55c4e26f75fa5ca8a2f5f36aa3a51f68e27033341d3e4", + "sha256:d9bdec0013ef1eb5a84ab39a3b3868911598afa494f5faa038647101504e2b81" + ], + "markers": "python_version >= '3.6'", + "version": "==3.0.6" + }, + "python-dateutil": { + "hashes": [ + "sha256:0123cacc1627ae19ddf3c27a5de5bd67ee4586fbdd6440d9748f8abb483d3e86", + "sha256:961d03dc3453ebbc59dbdea9e4e11c5651520a876d0f4db161e8674aae935da9" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", + "version": "==2.8.2" + }, + "pytidylib": { + "hashes": [ + "sha256:22b1c8d75970d8064ff999c2369e98af1d0685417eda4c829a5c9f56764b0af3" + ], + "index": "pypi", + "version": "==0.3.2" + }, + "pytz": { + "hashes": [ + "sha256:3672058bc3453457b622aab7a1c3bfd5ab0bdae451512f6cf25f64ed37f5b87c", + "sha256:acad2d8b20a1af07d4e4c9d2e9285c5ed9104354062f275f3fcd88dcef4f1326" + ], + "index": "pypi", + "version": "==2021.3" + }, + "pyyaml": { + "hashes": [ + "sha256:0283c35a6a9fbf047493e3a0ce8d79ef5030852c51e9d911a27badfde0605293", + "sha256:055d937d65826939cb044fc8c9b08889e8c743fdc6a32b33e2390f66013e449b", + "sha256:07751360502caac1c067a8132d150cf3d61339af5691fe9e87803040dbc5db57", + "sha256:0b4624f379dab24d3725ffde76559cff63d9ec94e1736b556dacdfebe5ab6d4b", + "sha256:0ce82d761c532fe4ec3f87fc45688bdd3a4c1dc5e0b4a19814b9009a29baefd4", + "sha256:1e4747bc279b4f613a09eb64bba2ba602d8a6664c6ce6396a4d0cd413a50ce07", + "sha256:213c60cd50106436cc818accf5baa1aba61c0189ff610f64f4a3e8c6726218ba", + "sha256:231710d57adfd809ef5d34183b8ed1eeae3f76459c18fb4a0b373ad56bedcdd9", + "sha256:277a0ef2981ca40581a47093e9e2d13b3f1fbbeffae064c1d21bfceba2030287", + "sha256:2cd5df3de48857ed0544b34e2d40e9fac445930039f3cfe4bcc592a1f836d513", + "sha256:40527857252b61eacd1d9af500c3337ba8deb8fc298940291486c465c8b46ec0", + "sha256:473f9edb243cb1935ab5a084eb238d842fb8f404ed2193a915d1784b5a6b5fc0", + "sha256:48c346915c114f5fdb3ead70312bd042a953a8ce5c7106d5bfb1a5254e47da92", + "sha256:50602afada6d6cbfad699b0c7bb50d5ccffa7e46a3d738092afddc1f9758427f", + "sha256:68fb519c14306fec9720a2a5b45bc9f0c8d1b9c72adf45c37baedfcd949c35a2", + "sha256:77f396e6ef4c73fdc33a9157446466f1cff553d979bd00ecb64385760c6babdc", + "sha256:819b3830a1543db06c4d4b865e70ded25be52a2e0631ccd2f6a47a2822f2fd7c", + "sha256:897b80890765f037df3403d22bab41627ca8811ae55e9a722fd0392850ec4d86", + "sha256:98c4d36e99714e55cfbaaee6dd5badbc9a1ec339ebfc3b1f52e293aee6bb71a4", + "sha256:9df7ed3b3d2e0ecfe09e14741b857df43adb5a3ddadc919a2d94fbdf78fea53c", + "sha256:9fa600030013c4de8165339db93d182b9431076eb98eb40ee068700c9c813e34", + "sha256:a80a78046a72361de73f8f395f1f1e49f956c6be882eed58505a15f3e430962b", + "sha256:b3d267842bf12586ba6c734f89d1f5b871df0273157918b0ccefa29deb05c21c", + "sha256:b5b9eccad747aabaaffbc6064800670f0c297e52c12754eb1d976c57e4f74dcb", + "sha256:c5687b8d43cf58545ade1fe3e055f70eac7a5a1a0bf42824308d868289a95737", + "sha256:cba8c411ef271aa037d7357a2bc8f9ee8b58b9965831d9e51baf703280dc73d3", + "sha256:d15a181d1ecd0d4270dc32edb46f7cb7733c7c508857278d3d378d14d606db2d", + "sha256:d4db7c7aef085872ef65a8fd7d6d09a14ae91f691dec3e87ee5ee0539d516f53", + "sha256:d4eccecf9adf6fbcc6861a38015c2a64f38b9d94838ac1810a9023a0609e1b78", + 
"sha256:d67d839ede4ed1b28a4e8909735fc992a923cdb84e618544973d7dfc71540803", + "sha256:daf496c58a8c52083df09b80c860005194014c3698698d1a57cbcfa182142a3a", + "sha256:e61ceaab6f49fb8bdfaa0f92c4b57bcfbea54c09277b1b4f7ac376bfb7a7c174", + "sha256:f84fbc98b019fef2ee9a1cb3ce93e3187a6df0b2538a651bfb890254ba9f90b5" + ], + "index": "pypi", + "version": "==6.0" + }, + "requests": { + "hashes": [ + "sha256:6c1246513ecd5ecd4528a0906f910e8f0f9c6b8ec72030dc9fd154dc1a6efd24", + "sha256:b8aa58f8cf793ffd8782d3d8cb19e66ef36f7aba4353eec859e74678b01b07a7" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'", + "version": "==2.26.0" + }, + "requests-file": { + "hashes": [ + "sha256:07d74208d3389d01c38ab89ef403af0cfec63957d53a0081d8eca738d0247d8e", + "sha256:dfe5dae75c12481f68ba353183c53a65e6044c923e64c24b2209f6c7570ca953" + ], + "version": "==1.5.1" + }, + "sgmllib3k": { + "hashes": [ + "sha256:7868fb1c8bfa764c1ac563d3cf369c381d1325d36124933a726f29fcdaa812e9" + ], + "version": "==1.0.0" + }, + "simhash": { + "hashes": [ + "sha256:0245b465fbe0bd17a74f5b89b9a70c3061984e37d7d94214eb5a8ef545384b6d", + "sha256:18d9c476d1bec9fa039293e4659ef49976585f9e051cb78afec30c4ce8fa361a", + "sha256:a4f84ac68b9afff17c9f1e6046ba60ed5eff40578ddf8d6a3d54709c44fafea0", + "sha256:d486d44a1dde0245d0733b91c86d892e87a062c932a372d184f4d9ce970e2708", + "sha256:debaf4fff92f192dc0414f31fda1ef90069936b3d05ec520d2c790128c48ee9a" + ], + "index": "pypi", + "version": "==2.0.0" + }, + "six": { + "hashes": [ + "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926", + "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", + "version": "==1.16.0" + }, + "snowballstemmer": { + "hashes": [ + "sha256:09b16deb8547d3412ad7b590689584cd0fe25ec8db3be37788be3810cbf19cb1", + "sha256:c8e1716e83cc398ae16824e5572ae04e0d9fc2c6b985fb0f900f5f0c96ecba1a" + ], + "version": "==2.2.0" + }, + "soupsieve": { + "hashes": [ + "sha256:1a3cca2617c6b38c0343ed661b1fa5de5637f257d4fe22bd9f1338010a1efefb", + "sha256:b8d49b1cd4f037c7082a9683dfa1801aa2597fb11c3a1155b7a5b94829b4f1f9" + ], + "markers": "python_version >= '3.6'", + "version": "==2.3.1" + }, + "sphinx": { + "hashes": [ + "sha256:048dac56039a5713f47a554589dc98a442b39226a2b9ed7f82797fcb2fe9253f", + "sha256:32a5b3e9a1b176cc25ed048557d4d3d01af635e6b76c5bc7a43b0a34447fbd45" + ], + "markers": "python_version >= '3.6'", + "version": "==4.3.1" + }, + "sphinx-rtd-theme": { + "hashes": [ + "sha256:4d35a56f4508cfee4c4fb604373ede6feae2a306731d533f409ef5c3496fdbd8", + "sha256:eec6d497e4c2195fa0e8b2016b337532b8a699a68bcb22a512870e16925c6a5c" + ], + "index": "pypi", + "version": "==1.0.0" + }, + "sphinxcontrib-applehelp": { + "hashes": [ + "sha256:806111e5e962be97c29ec4c1e7fe277bfd19e9652fb1a4392105b43e01af885a", + "sha256:a072735ec80e7675e3f432fcae8610ecf509c5f1869d17e2eecff44389cdbc58" + ], + "markers": "python_version >= '3.5'", + "version": "==1.0.2" + }, + "sphinxcontrib-devhelp": { + "hashes": [ + "sha256:8165223f9a335cc1af7ffe1ed31d2871f325254c0423bc0c4c7cd1c1e4734a2e", + "sha256:ff7f1afa7b9642e7060379360a67e9c41e8f3121f2ce9164266f61b9f4b338e4" + ], + "markers": "python_version >= '3.5'", + "version": "==1.0.2" + }, + "sphinxcontrib-htmlhelp": { + "hashes": [ + "sha256:d412243dfb797ae3ec2b59eca0e52dac12e75a241bf0e4eb861e450d06c6ed07", + "sha256:f5f8bb2d0d629f398bf47d0d69c07bc13b65f75a81ad9e2f71a63d4b7a2f6db2" + ], + "markers": "python_version >= '3.6'", + 
"version": "==2.0.0" + }, + "sphinxcontrib-jsmath": { + "hashes": [ + "sha256:2ec2eaebfb78f3f2078e73666b1415417a116cc848b72e5172e596c871103178", + "sha256:a9925e4a4587247ed2191a22df5f6970656cb8ca2bd6284309578f2153e0c4b8" + ], + "markers": "python_version >= '3.5'", + "version": "==1.0.1" + }, + "sphinxcontrib-qthelp": { + "hashes": [ + "sha256:4c33767ee058b70dba89a6fc5c1892c0d57a54be67ddd3e7875a18d14cba5a72", + "sha256:bd9fc24bcb748a8d51fd4ecaade681350aa63009a347a8c14e637895444dfab6" + ], + "markers": "python_version >= '3.5'", + "version": "==1.0.3" + }, + "sphinxcontrib-serializinghtml": { + "hashes": [ + "sha256:352a9a00ae864471d3a7ead8d7d79f5fc0b57e8b3f95e9867eb9eb28999b92fd", + "sha256:aa5f6de5dfdf809ef505c4895e51ef5c9eac17d0f287933eb49ec495280b6952" + ], + "markers": "python_version >= '3.5'", + "version": "==1.1.5" + }, + "tika": { + "hashes": [ + "sha256:c2c50f405622f74531841104f9e85c17511aede11de8e5385eab1a29a31f191b" + ], + "index": "pypi", + "version": "==1.24" + }, + "tldextract": { + "hashes": [ + "sha256:d2034c3558651f7d8fdadea83fb681050b2d662dc67a00d950326dc902029444", + "sha256:f55e05f6bf4cc952a87d13594386d32ad2dd265630a8bdfc3df03bd60425c6b0" + ], + "index": "pypi", + "version": "==3.1.2" + }, + "types-pyyaml": { + "hashes": [ + "sha256:2e27b0118ca4248a646101c5c318dc02e4ca2866d6bc42e84045dbb851555a76", + "sha256:d5b318269652e809b5c30a5fe666c50159ab80bfd41cd6bafe655bf20b29fcba" + ], + "index": "pypi", + "version": "==6.0.1" + }, + "typing-extensions": { + "hashes": [ + "sha256:2cdf80e4e04866a9b3689a51869016d36db0814d84b8d8a568d22781d45d27ed", + "sha256:829704698b22e13ec9eaf959122315eabb370b0884400e9818334d8b677023d9" + ], + "markers": "python_version >= '3.6'", + "version": "==4.0.0" + }, + "urllib3": { + "hashes": [ + "sha256:4987c65554f7a2dbf30c18fd48778ef124af6fab771a377103da0585e2336ece", + "sha256:c4fdf4019605b6e5423637e01bc9fe4daef873709a7973e195ceba0a62bbc844" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'", + "version": "==1.26.7" + }, + "voluptuous": { + "hashes": [ + "sha256:4db1ac5079db9249820d49c891cb4660a6f8cae350491210abce741fabf56513" + ], + "index": "pypi", + "version": "==0.12.2" + }, + "wheel": { + "hashes": [ + "sha256:21014b2bd93c6d0034b6ba5d35e4eb284340e09d63c59aef6fc14b0f346146fd", + "sha256:e2ef7239991699e3355d54f8e968a21bb940a1dbf34a4d226741e64462516fad" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", + "version": "==0.37.0" + }, + "yarl": { + "hashes": [ + "sha256:044daf3012e43d4b3538562da94a88fb12a6490652dbc29fb19adfa02cf72eac", + "sha256:0cba38120db72123db7c58322fa69e3c0efa933040ffb586c3a87c063ec7cae8", + "sha256:167ab7f64e409e9bdd99333fe8c67b5574a1f0495dcfd905bc7454e766729b9e", + "sha256:1be4bbb3d27a4e9aa5f3df2ab61e3701ce8fcbd3e9846dbce7c033a7e8136746", + "sha256:1ca56f002eaf7998b5fcf73b2421790da9d2586331805f38acd9997743114e98", + "sha256:1d3d5ad8ea96bd6d643d80c7b8d5977b4e2fb1bab6c9da7322616fd26203d125", + "sha256:1eb6480ef366d75b54c68164094a6a560c247370a68c02dddb11f20c4c6d3c9d", + "sha256:1edc172dcca3f11b38a9d5c7505c83c1913c0addc99cd28e993efeaafdfaa18d", + "sha256:211fcd65c58bf250fb994b53bc45a442ddc9f441f6fec53e65de8cba48ded986", + "sha256:29e0656d5497733dcddc21797da5a2ab990c0cb9719f1f969e58a4abac66234d", + "sha256:368bcf400247318382cc150aaa632582d0780b28ee6053cd80268c7e72796dec", + "sha256:39d5493c5ecd75c8093fa7700a2fb5c94fe28c839c8e40144b7ab7ccba6938c8", + 
"sha256:3abddf0b8e41445426d29f955b24aeecc83fa1072be1be4e0d194134a7d9baee", + "sha256:3bf8cfe8856708ede6a73907bf0501f2dc4e104085e070a41f5d88e7faf237f3", + "sha256:3ec1d9a0d7780416e657f1e405ba35ec1ba453a4f1511eb8b9fbab81cb8b3ce1", + "sha256:45399b46d60c253327a460e99856752009fcee5f5d3c80b2f7c0cae1c38d56dd", + "sha256:52690eb521d690ab041c3919666bea13ab9fbff80d615ec16fa81a297131276b", + "sha256:534b047277a9a19d858cde163aba93f3e1677d5acd92f7d10ace419d478540de", + "sha256:580c1f15500e137a8c37053e4cbf6058944d4c114701fa59944607505c2fe3a0", + "sha256:59218fef177296451b23214c91ea3aba7858b4ae3306dde120224cfe0f7a6ee8", + "sha256:5ba63585a89c9885f18331a55d25fe81dc2d82b71311ff8bd378fc8004202ff6", + "sha256:5bb7d54b8f61ba6eee541fba4b83d22b8a046b4ef4d8eb7f15a7e35db2e1e245", + "sha256:6152224d0a1eb254f97df3997d79dadd8bb2c1a02ef283dbb34b97d4f8492d23", + "sha256:67e94028817defe5e705079b10a8438b8cb56e7115fa01640e9c0bb3edf67332", + "sha256:695ba021a9e04418507fa930d5f0704edbce47076bdcfeeaba1c83683e5649d1", + "sha256:6a1a9fe17621af43e9b9fcea8bd088ba682c8192d744b386ee3c47b56eaabb2c", + "sha256:6ab0c3274d0a846840bf6c27d2c60ba771a12e4d7586bf550eefc2df0b56b3b4", + "sha256:6feca8b6bfb9eef6ee057628e71e1734caf520a907b6ec0d62839e8293e945c0", + "sha256:737e401cd0c493f7e3dd4db72aca11cfe069531c9761b8ea474926936b3c57c8", + "sha256:788713c2896f426a4e166b11f4ec538b5736294ebf7d5f654ae445fd44270832", + "sha256:797c2c412b04403d2da075fb93c123df35239cd7b4cc4e0cd9e5839b73f52c58", + "sha256:8300401dc88cad23f5b4e4c1226f44a5aa696436a4026e456fe0e5d2f7f486e6", + "sha256:87f6e082bce21464857ba58b569370e7b547d239ca22248be68ea5d6b51464a1", + "sha256:89ccbf58e6a0ab89d487c92a490cb5660d06c3a47ca08872859672f9c511fc52", + "sha256:8b0915ee85150963a9504c10de4e4729ae700af11df0dc5550e6587ed7891e92", + "sha256:8cce6f9fa3df25f55521fbb5c7e4a736683148bcc0c75b21863789e5185f9185", + "sha256:95a1873b6c0dd1c437fb3bb4a4aaa699a48c218ac7ca1e74b0bee0ab16c7d60d", + "sha256:9b4c77d92d56a4c5027572752aa35082e40c561eec776048330d2907aead891d", + "sha256:9bfcd43c65fbb339dc7086b5315750efa42a34eefad0256ba114cd8ad3896f4b", + "sha256:9c1f083e7e71b2dd01f7cd7434a5f88c15213194df38bc29b388ccdf1492b739", + "sha256:a1d0894f238763717bdcfea74558c94e3bc34aeacd3351d769460c1a586a8b05", + "sha256:a467a431a0817a292121c13cbe637348b546e6ef47ca14a790aa2fa8cc93df63", + "sha256:aa32aaa97d8b2ed4e54dc65d241a0da1c627454950f7d7b1f95b13985afd6c5d", + "sha256:ac10bbac36cd89eac19f4e51c032ba6b412b3892b685076f4acd2de18ca990aa", + "sha256:ac35ccde589ab6a1870a484ed136d49a26bcd06b6a1c6397b1967ca13ceb3913", + "sha256:bab827163113177aee910adb1f48ff7af31ee0289f434f7e22d10baf624a6dfe", + "sha256:baf81561f2972fb895e7844882898bda1eef4b07b5b385bcd308d2098f1a767b", + "sha256:bf19725fec28452474d9887a128e98dd67eee7b7d52e932e6949c532d820dc3b", + "sha256:c01a89a44bb672c38f42b49cdb0ad667b116d731b3f4c896f72302ff77d71656", + "sha256:c0910c6b6c31359d2f6184828888c983d54d09d581a4a23547a35f1d0b9484b1", + "sha256:c10ea1e80a697cf7d80d1ed414b5cb8f1eec07d618f54637067ae3c0334133c4", + "sha256:c1164a2eac148d85bbdd23e07dfcc930f2e633220f3eb3c3e2a25f6148c2819e", + "sha256:c145ab54702334c42237a6c6c4cc08703b6aa9b94e2f227ceb3d477d20c36c63", + "sha256:c17965ff3706beedafd458c452bf15bac693ecd146a60a06a214614dc097a271", + "sha256:c19324a1c5399b602f3b6e7db9478e5b1adf5cf58901996fc973fe4fccd73eed", + "sha256:c2a1ac41a6aa980db03d098a5531f13985edcb451bcd9d00670b03129922cd0d", + "sha256:c6ddcd80d79c96eb19c354d9dca95291589c5954099836b7c8d29278a7ec0bda", + "sha256:c9c6d927e098c2d360695f2e9d38870b2e92e0919be07dbe339aefa32a090265", + 
"sha256:cc8b7a7254c0fc3187d43d6cb54b5032d2365efd1df0cd1749c0c4df5f0ad45f", + "sha256:cff3ba513db55cc6a35076f32c4cdc27032bd075c9faef31fec749e64b45d26c", + "sha256:d260d4dc495c05d6600264a197d9d6f7fc9347f21d2594926202fd08cf89a8ba", + "sha256:d6f3d62e16c10e88d2168ba2d065aa374e3c538998ed04996cd373ff2036d64c", + "sha256:da6df107b9ccfe52d3a48165e48d72db0eca3e3029b5b8cb4fe6ee3cb870ba8b", + "sha256:dfe4b95b7e00c6635a72e2d00b478e8a28bfb122dc76349a06e20792eb53a523", + "sha256:e39378894ee6ae9f555ae2de332d513a5763276a9265f8e7cbaeb1b1ee74623a", + "sha256:ede3b46cdb719c794427dcce9d8beb4abe8b9aa1e97526cc20de9bd6583ad1ef", + "sha256:f2a8508f7350512434e41065684076f640ecce176d262a7d54f0da41d99c5a95", + "sha256:f44477ae29025d8ea87ec308539f95963ffdc31a82f42ca9deecf2d505242e72", + "sha256:f64394bd7ceef1237cc604b5a89bf748c95982a84bcd3c4bbeb40f685c810794", + "sha256:fc4dd8b01a8112809e6b636b00f487846956402834a7fd59d46d4f4267181c41", + "sha256:fce78593346c014d0d986b7ebc80d782b7f5e19843ca798ed62f8e3ba8728576", + "sha256:fd547ec596d90c8676e369dd8a581a21227fe9b4ad37d0dc7feb4ccf544c2d59" + ], + "markers": "python_version >= '3.6'", + "version": "==1.7.2" + } + }, + "develop": { + "alabaster": { + "hashes": [ + "sha256:446438bdcca0e05bd45ea2de1668c1d9b032e1a9154c2c259092d77031ddd359", + "sha256:a661d72d58e6ea8a57f7a86e37d86716863ee5e92788398526d58b26a4e4dc02" + ], + "version": "==0.7.12" + }, + "attrs": { + "hashes": [ + "sha256:149e90d6d8ac20db7a955ad60cf0e6881a3f20d37096140088356da6c716b0b1", + "sha256:ef6aaac3ca6cd92904cdd0d83f629a15f18053ec84e6432106f7a4d04ae4f5fb" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", + "version": "==21.2.0" + }, + "autoflake": { + "hashes": [ + "sha256:61a353012cff6ab94ca062823d1fb2f692c4acda51c76ff83a8d77915fba51ea" + ], + "index": "pypi", + "version": "==1.4" + }, + "babel": { + "hashes": [ + "sha256:ab49e12b91d937cd11f0b67cb259a57ab4ad2b59ac7a3b41d6c06c0ac5b0def9", + "sha256:bc0c176f9f6a994582230df350aa6e05ba2ebe4b3ac317eab29d9be5d2768da0" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", + "version": "==2.9.1" + }, + "backports.entry-points-selectable": { + "hashes": [ + "sha256:7fceed9532a7aa2bd888654a7314f864a3c16a4e710b34a58cfc0f08114c663b", + "sha256:914b21a479fde881635f7af5adc7f6e38d6b274be32269070c53b698c60d5386" + ], + "markers": "python_version >= '2.7'", + "version": "==1.1.1" + }, + "black": { + "hashes": [ + "sha256:802c6c30b637b28645b7fde282ed2569c0cd777dbe493a41b6a03c1d903f99ac", + "sha256:a042adbb18b3262faad5aff4e834ff186bb893f95ba3a8013f09de1e5569def2" + ], + "index": "pypi", + "version": "==21.11b1" + }, + "blacken-docs": { + "hashes": [ + "sha256:3e8138b22c33406cef5946058e535a8aca45cd64b8e7d392b3bd1329fc1f4af8", + "sha256:a81e0abc9771521f445ee582f469c8ec2f5880c19c369d766bb151f79f642d7b" + ], + "index": "pypi", + "version": "==1.12.0" + }, + "certifi": { + "hashes": [ + "sha256:78884e7c1d4b00ce3cea67b44566851c4343c120abd683433ce934a68ea58872", + "sha256:d62a0163eb4c2344ac042ab2bdf75399a71a2d8c7d47eac2e2ee91b9d6339569" + ], + "version": "==2021.10.8" + }, + "cfgv": { + "hashes": [ + "sha256:c6a0883f3917a037485059700b9e75da2464e6c27051014ad85ba6aaa5884426", + "sha256:f5a830efb9ce7a445376bb66ec94c638a9787422f96264c98edc6bdeed8ab736" + ], + "markers": "python_full_version >= '3.6.1'", + "version": "==3.3.1" + }, + "charset-normalizer": { + "hashes": [ + "sha256:735e240d9a8506778cd7a453d97e817e536bb1fc29f4f6961ce297b9c7a917b0", + 
"sha256:83fcdeb225499d6344c8f7f34684c2981270beacc32ede2e669e94f7fa544405" + ], + "markers": "python_version >= '3'", + "version": "==2.0.8" + }, + "click": { + "hashes": [ + "sha256:d2b5255c7c6349bc1bd1e59e08cd12acbbd63ce649f2588755783aa94dfb6b1a", + "sha256:dacca89f4bfadd5de3d7489b7c8a566eee0d3676333fbb50030263894c38c0dc" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", + "version": "==7.1.2" + }, + "colorama": { + "hashes": [ + "sha256:5941b2b48a20143d2267e95b1c2a7603ce057ee39fd88e7329b0c292aa16869b", + "sha256:9f47eda37229f68eee03b24b9748937c7dc3868f906e8ba69fbcbdd3bc5dc3e2" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", + "version": "==0.4.4" + }, + "distlib": { + "hashes": [ + "sha256:c8b54e8454e5bf6237cc84c20e8264c3e991e824ef27e8f1e81049867d861e31", + "sha256:d982d0751ff6eaaab5e2ec8e691d949ee80eddf01a62eaa96ddb11531fe16b05" + ], + "version": "==0.3.3" + }, + "docutils": { + "hashes": [ + "sha256:686577d2e4c32380bb50cbb22f575ed742d58168cee37e99117a854bcd88f125", + "sha256:cf316c8370a737a022b72b56874f6602acf974a37a9fba42ec2876387549fc61" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", + "version": "==0.17.1" + }, + "filelock": { + "hashes": [ + "sha256:2e139a228bcf56dd8b2274a65174d005c4a6b68540ee0bdbb92c76f43f29f7e8", + "sha256:93d512b32a23baf4cac44ffd72ccf70732aeff7b8050fcaf6d3ec406d954baf4" + ], + "markers": "python_version >= '3.6'", + "version": "==3.4.0" + }, + "identify": { + "hashes": [ + "sha256:a33ae873287e81651c7800ca309dc1f84679b763c9c8b30680e16fbfa82f0107", + "sha256:eba31ca80258de6bb51453084bff4a923187cd2193b9c13710f2516ab30732cc" + ], + "markers": "python_full_version >= '3.6.1'", + "version": "==2.4.0" + }, + "idna": { + "hashes": [ + "sha256:84d9dd047ffa80596e0f246e2eab0b391788b0503584e8945f2368256d2735ff", + "sha256:9d643ff0a55b762d5cdb124b8eaa99c66322e2157b69160bc32796e824360e6d" + ], + "markers": "python_version >= '3'", + "version": "==3.3" + }, + "imagesize": { + "hashes": [ + "sha256:1db2f82529e53c3e929e8926a1fa9235aa82d0bd0c580359c67ec31b2fddaa8c", + "sha256:cd1750d452385ca327479d45b64d9c7729ecf0b3969a58148298c77092261f9d" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", + "version": "==1.3.0" + }, + "interrogate": { + "hashes": [ + "sha256:a4ccc5cbd727c74acc98dee6f5e79ef264c0bcfa66b68d4e123069b2af89091a", + "sha256:b6f325f0aa84ac3ac6779d8708264d366102226c5af7d69058cecffcff7a6d6c" + ], + "index": "pypi", + "version": "==1.5.0" + }, + "isort": { + "hashes": [ + "sha256:6f62d78e2f89b4500b080fe3a81690850cd254227f27f75c3a0c491a1f351ba7", + "sha256:e8443a5e7a020e9d7f97f1d7d9cd17c88bcb3bc7e218bf9cf5095fe550be2951" + ], + "index": "pypi", + "version": "==5.10.1" + }, + "jinja2": { + "hashes": [ + "sha256:077ce6014f7b40d03b47d1f1ca4b0fc8328a692bd284016f806ed0eaca390ad8", + "sha256:611bb273cd68f3b993fabdc4064fc858c5b47a973cb5aa7999ec1ba405c87cd7" + ], + "markers": "python_version >= '3.6'", + "version": "==3.0.3" + }, + "libcst": { + "hashes": [ + "sha256:2e1f77fbaaff93b889376c92f588b718edbdc21f956abbe27d10dfd1ff2d76c3", + "sha256:330f9082a309bad808e283e80845a843200303bb256690185b98ca458a62c4f8" + ], + "markers": "python_version >= '3.6'", + "version": "==0.3.23" + }, + "markdown-it-py": { + "hashes": [ + "sha256:36be6bb3ad987bfdb839f5ba78ddf094552ca38ccbd784ae4f74a4e1419fc6e3", + "sha256:98080fc0bc34c4f2bcf0846a096a9429acbd9d5d8e67ed34026c03c61c464389" + ], + "markers": 
"python_version ~= '3.6'", + "version": "==1.1.0" + }, + "markupsafe": { + "hashes": [ + "sha256:01a9b8ea66f1658938f65b93a85ebe8bc016e6769611be228d797c9d998dd298", + "sha256:023cb26ec21ece8dc3907c0e8320058b2e0cb3c55cf9564da612bc325bed5e64", + "sha256:0446679737af14f45767963a1a9ef7620189912317d095f2d9ffa183a4d25d2b", + "sha256:04635854b943835a6ea959e948d19dcd311762c5c0c6e1f0e16ee57022669194", + "sha256:0717a7390a68be14b8c793ba258e075c6f4ca819f15edfc2a3a027c823718567", + "sha256:0955295dd5eec6cb6cc2fe1698f4c6d84af2e92de33fbcac4111913cd100a6ff", + "sha256:0d4b31cc67ab36e3392bbf3862cfbadac3db12bdd8b02a2731f509ed5b829724", + "sha256:10f82115e21dc0dfec9ab5c0223652f7197feb168c940f3ef61563fc2d6beb74", + "sha256:168cd0a3642de83558a5153c8bd34f175a9a6e7f6dc6384b9655d2697312a646", + "sha256:1d609f577dc6e1aa17d746f8bd3c31aa4d258f4070d61b2aa5c4166c1539de35", + "sha256:1f2ade76b9903f39aa442b4aadd2177decb66525062db244b35d71d0ee8599b6", + "sha256:20dca64a3ef2d6e4d5d615a3fd418ad3bde77a47ec8a23d984a12b5b4c74491a", + "sha256:2a7d351cbd8cfeb19ca00de495e224dea7e7d919659c2841bbb7f420ad03e2d6", + "sha256:2d7d807855b419fc2ed3e631034685db6079889a1f01d5d9dac950f764da3dad", + "sha256:2ef54abee730b502252bcdf31b10dacb0a416229b72c18b19e24a4509f273d26", + "sha256:36bc903cbb393720fad60fc28c10de6acf10dc6cc883f3e24ee4012371399a38", + "sha256:37205cac2a79194e3750b0af2a5720d95f786a55ce7df90c3af697bfa100eaac", + "sha256:3c112550557578c26af18a1ccc9e090bfe03832ae994343cfdacd287db6a6ae7", + "sha256:3dd007d54ee88b46be476e293f48c85048603f5f516008bee124ddd891398ed6", + "sha256:4296f2b1ce8c86a6aea78613c34bb1a672ea0e3de9c6ba08a960efe0b0a09047", + "sha256:47ab1e7b91c098ab893b828deafa1203de86d0bc6ab587b160f78fe6c4011f75", + "sha256:49e3ceeabbfb9d66c3aef5af3a60cc43b85c33df25ce03d0031a608b0a8b2e3f", + "sha256:4dc8f9fb58f7364b63fd9f85013b780ef83c11857ae79f2feda41e270468dd9b", + "sha256:4efca8f86c54b22348a5467704e3fec767b2db12fc39c6d963168ab1d3fc9135", + "sha256:53edb4da6925ad13c07b6d26c2a852bd81e364f95301c66e930ab2aef5b5ddd8", + "sha256:5855f8438a7d1d458206a2466bf82b0f104a3724bf96a1c781ab731e4201731a", + "sha256:594c67807fb16238b30c44bdf74f36c02cdf22d1c8cda91ef8a0ed8dabf5620a", + "sha256:5b6d930f030f8ed98e3e6c98ffa0652bdb82601e7a016ec2ab5d7ff23baa78d1", + "sha256:5bb28c636d87e840583ee3adeb78172efc47c8b26127267f54a9c0ec251d41a9", + "sha256:60bf42e36abfaf9aff1f50f52644b336d4f0a3fd6d8a60ca0d054ac9f713a864", + "sha256:611d1ad9a4288cf3e3c16014564df047fe08410e628f89805e475368bd304914", + "sha256:6300b8454aa6930a24b9618fbb54b5a68135092bc666f7b06901f897fa5c2fee", + "sha256:63f3268ba69ace99cab4e3e3b5840b03340efed0948ab8f78d2fd87ee5442a4f", + "sha256:6557b31b5e2c9ddf0de32a691f2312a32f77cd7681d8af66c2692efdbef84c18", + "sha256:693ce3f9e70a6cf7d2fb9e6c9d8b204b6b39897a2c4a1aa65728d5ac97dcc1d8", + "sha256:6a7fae0dd14cf60ad5ff42baa2e95727c3d81ded453457771d02b7d2b3f9c0c2", + "sha256:6c4ca60fa24e85fe25b912b01e62cb969d69a23a5d5867682dd3e80b5b02581d", + "sha256:6fcf051089389abe060c9cd7caa212c707e58153afa2c649f00346ce6d260f1b", + "sha256:7d91275b0245b1da4d4cfa07e0faedd5b0812efc15b702576d103293e252af1b", + "sha256:89c687013cb1cd489a0f0ac24febe8c7a666e6e221b783e53ac50ebf68e45d86", + "sha256:8d206346619592c6200148b01a2142798c989edcb9c896f9ac9722a99d4e77e6", + "sha256:905fec760bd2fa1388bb5b489ee8ee5f7291d692638ea5f67982d968366bef9f", + "sha256:97383d78eb34da7e1fa37dd273c20ad4320929af65d156e35a5e2d89566d9dfb", + "sha256:984d76483eb32f1bcb536dc27e4ad56bba4baa70be32fa87152832cdd9db0833", + "sha256:99df47edb6bda1249d3e80fdabb1dab8c08ef3975f69aed437cb69d0a5de1e28", + 
"sha256:9f02365d4e99430a12647f09b6cc8bab61a6564363f313126f775eb4f6ef798e", + "sha256:a30e67a65b53ea0a5e62fe23682cfe22712e01f453b95233b25502f7c61cb415", + "sha256:ab3ef638ace319fa26553db0624c4699e31a28bb2a835c5faca8f8acf6a5a902", + "sha256:aca6377c0cb8a8253e493c6b451565ac77e98c2951c45f913e0b52facdcff83f", + "sha256:add36cb2dbb8b736611303cd3bfcee00afd96471b09cda130da3581cbdc56a6d", + "sha256:b2f4bf27480f5e5e8ce285a8c8fd176c0b03e93dcc6646477d4630e83440c6a9", + "sha256:b7f2d075102dc8c794cbde1947378051c4e5180d52d276987b8d28a3bd58c17d", + "sha256:baa1a4e8f868845af802979fcdbf0bb11f94f1cb7ced4c4b8a351bb60d108145", + "sha256:be98f628055368795d818ebf93da628541e10b75b41c559fdf36d104c5787066", + "sha256:bf5d821ffabf0ef3533c39c518f3357b171a1651c1ff6827325e4489b0e46c3c", + "sha256:c47adbc92fc1bb2b3274c4b3a43ae0e4573d9fbff4f54cd484555edbf030baf1", + "sha256:cdfba22ea2f0029c9261a4bd07e830a8da012291fbe44dc794e488b6c9bb353a", + "sha256:d6c7ebd4e944c85e2c3421e612a7057a2f48d478d79e61800d81468a8d842207", + "sha256:d7f9850398e85aba693bb640262d3611788b1f29a79f0c93c565694658f4071f", + "sha256:d8446c54dc28c01e5a2dbac5a25f071f6653e6e40f3a8818e8b45d790fe6ef53", + "sha256:deb993cacb280823246a026e3b2d81c493c53de6acfd5e6bfe31ab3402bb37dd", + "sha256:e0f138900af21926a02425cf736db95be9f4af72ba1bb21453432a07f6082134", + "sha256:e9936f0b261d4df76ad22f8fee3ae83b60d7c3e871292cd42f40b81b70afae85", + "sha256:f0567c4dc99f264f49fe27da5f735f414c4e7e7dd850cfd8e69f0862d7c74ea9", + "sha256:f5653a225f31e113b152e56f154ccbe59eeb1c7487b39b9d9f9cdb58e6c79dc5", + "sha256:f826e31d18b516f653fe296d967d700fddad5901ae07c622bb3705955e1faa94", + "sha256:f8ba0e8349a38d3001fae7eadded3f6606f0da5d748ee53cc1dab1d6527b9509", + "sha256:f9081981fe268bd86831e5c75f7de206ef275defcb82bc70740ae6dc507aee51", + "sha256:fa130dd50c57d53368c9d59395cb5526eda596d3ffe36666cd81a44d56e48872" + ], + "markers": "python_version >= '3.6'", + "version": "==2.0.1" + }, + "mdit-py-plugins": { + "hashes": [ + "sha256:1833bf738e038e35d89cb3a07eb0d227ed647ce7dd357579b65343740c6d249c", + "sha256:5991cef645502e80a5388ec4fc20885d2313d4871e8b8e320ca2de14ac0c015f" + ], + "markers": "python_version ~= '3.6'", + "version": "==0.2.8" + }, + "mypy": { + "hashes": [ + "sha256:088cd9c7904b4ad80bec811053272986611b84221835e079be5bcad029e79dd9", + "sha256:0aadfb2d3935988ec3815952e44058a3100499f5be5b28c34ac9d79f002a4a9a", + "sha256:119bed3832d961f3a880787bf621634ba042cb8dc850a7429f643508eeac97b9", + "sha256:1a85e280d4d217150ce8cb1a6dddffd14e753a4e0c3cf90baabb32cefa41b59e", + "sha256:3c4b8ca36877fc75339253721f69603a9c7fdb5d4d5a95a1a1b899d8b86a4de2", + "sha256:3e382b29f8e0ccf19a2df2b29a167591245df90c0b5a2542249873b5c1d78212", + "sha256:42c266ced41b65ed40a282c575705325fa7991af370036d3f134518336636f5b", + "sha256:53fd2eb27a8ee2892614370896956af2ff61254c275aaee4c230ae771cadd885", + "sha256:704098302473cb31a218f1775a873b376b30b4c18229421e9e9dc8916fd16150", + "sha256:7df1ead20c81371ccd6091fa3e2878559b5c4d4caadaf1a484cf88d93ca06703", + "sha256:866c41f28cee548475f146aa4d39a51cf3b6a84246969f3759cb3e9c742fc072", + "sha256:a155d80ea6cee511a3694b108c4494a39f42de11ee4e61e72bc424c490e46457", + "sha256:adaeee09bfde366d2c13fe6093a7df5df83c9a2ba98638c7d76b010694db760e", + "sha256:b6fb13123aeef4a3abbcfd7e71773ff3ff1526a7d3dc538f3929a49b42be03f0", + "sha256:b94e4b785e304a04ea0828759172a15add27088520dc7e49ceade7834275bedb", + "sha256:c0df2d30ed496a08de5daed2a9ea807d07c21ae0ab23acf541ab88c24b26ab97", + "sha256:c6c2602dffb74867498f86e6129fd52a2770c48b7cd3ece77ada4fa38f94eba8", + 
"sha256:ceb6e0a6e27fb364fb3853389607cf7eb3a126ad335790fa1e14ed02fba50811", + "sha256:d9dd839eb0dc1bbe866a288ba3c1afc33a202015d2ad83b31e875b5905a079b6", + "sha256:e4dab234478e3bd3ce83bac4193b2ecd9cf94e720ddd95ce69840273bf44f6de", + "sha256:ec4e0cd079db280b6bdabdc807047ff3e199f334050db5cbb91ba3e959a67504", + "sha256:ecd2c3fe726758037234c93df7e98deb257fd15c24c9180dacf1ef829da5f921", + "sha256:ef565033fa5a958e62796867b1df10c40263ea9ded87164d67572834e57a174d" + ], + "index": "pypi", + "version": "==0.910" + }, + "mypy-extensions": { + "hashes": [ + "sha256:090fedd75945a69ae91ce1303b5824f428daf5a028d2f6ab8a299250a846f15d", + "sha256:2d82818f5bb3e369420cb3c4060a7970edba416647068eb4c5343488a6c604a8" + ], + "version": "==0.4.3" + }, + "myst-parser": { + "hashes": [ + "sha256:40124b6f27a4c42ac7f06b385e23a9dcd03d84801e9c7130b59b3729a554b1f9", + "sha256:f7f3b2d62db7655cde658eb5d62b2ec2a4631308137bd8d10f296a40d57bbbeb" + ], + "index": "pypi", + "version": "==0.15.2" + }, + "nodeenv": { + "hashes": [ + "sha256:3ef13ff90291ba2a4a7a4ff9a979b63ffdd00a464dbe04acf0ea6471517a4c2b", + "sha256:621e6b7076565ddcacd2db0294c0381e01fd28945ab36bcf00f41c5daf63bef7" + ], + "version": "==1.6.0" + }, + "packaging": { + "hashes": [ + "sha256:dd47c42927d89ab911e606518907cc2d3a1f38bbd026385970643f9c5b8ecfeb", + "sha256:ef103e05f519cdc783ae24ea4e2e0f508a9c99b2d4969652eed6a2e1ea5bd522" + ], + "markers": "python_version >= '3.6'", + "version": "==21.3" + }, + "pathspec": { + "hashes": [ + "sha256:7d15c4ddb0b5c802d161efc417ec1a2558ea2653c2e8ad9c19098201dc1c993a", + "sha256:e564499435a2673d586f6b2130bb5b95f04a3ba06f81b8f895b651a3c76aabb1" + ], + "version": "==0.9.0" + }, + "platformdirs": { + "hashes": [ + "sha256:367a5e80b3d04d2428ffa76d33f124cf11e8fff2acdaa9b43d545f5c7d661ef2", + "sha256:8868bbe3c3c80d42f20156f22e7131d2fb321f5bc86a2a345375c6481a67021d" + ], + "markers": "python_version >= '3.6'", + "version": "==2.4.0" + }, + "pre-commit": { + "hashes": [ + "sha256:3c25add78dbdfb6a28a651780d5c311ac40dd17f160eb3954a0c59da40a505a7", + "sha256:a4ed01000afcb484d9eb8d504272e642c4c4099bbad3a6b27e519bd6a3e928a6" + ], + "index": "pypi", + "version": "==2.15.0" + }, + "py": { + "hashes": [ + "sha256:51c75c4126074b472f746a24399ad32f6053d1b34b68d2fa41e558e6f4a98719", + "sha256:607c53218732647dff4acdfcd50cb62615cedf612e72d1724fb1a0cc6405b378" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", + "version": "==1.11.0" + }, + "pybetter": { + "hashes": [ + "sha256:09379896e43b0da9e3b37b3d3aef0bc89d19b2646e023d984ef64e018cd21648", + "sha256:73ddc060e92eb0a192c4d7ba97cf26ea0c525a9d27bae7a917344a0c79d1755b" + ], + "index": "pypi", + "version": "==0.3.7" + }, + "pyemojify": { + "hashes": [ + "sha256:6bbc3c8d52e3df3e4039bc0cad3616d3eb579b4c6e15a11bd5e0ef0d579596a9", + "sha256:e70e4cfcfe0aed7b5bc64f39b023d5d62a5f5c0c31c1b7114cd43a059fb14a72" + ], + "version": "==0.2.0" + }, + "pyflakes": { + "hashes": [ + "sha256:05a85c2872edf37a4ed30b0cce2f6093e1d0581f8c19d7393122da7e25b2b24c", + "sha256:3bb3a3f256f4b7968c9c788781e4ff07dce46bdf12339dcda61053375426ee2e" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", + "version": "==2.4.0" + }, + "pygments": { + "hashes": [ + "sha256:b8e67fe6af78f492b3c4b3e2970c0624cbf08beb1e493b2c99b9fa1b67a20380", + "sha256:f398865f7eb6874156579fdf36bc840a03cab64d1cde9e93d68f46a425ec52c6" + ], + "markers": "python_version >= '3.5'", + "version": "==2.10.0" + }, + "pyparsing": { + "hashes": [ + 
"sha256:04ff808a5b90911829c55c4e26f75fa5ca8a2f5f36aa3a51f68e27033341d3e4", + "sha256:d9bdec0013ef1eb5a84ab39a3b3868911598afa494f5faa038647101504e2b81" + ], + "markers": "python_version >= '3.6'", + "version": "==3.0.6" + }, + "pytz": { + "hashes": [ + "sha256:3672058bc3453457b622aab7a1c3bfd5ab0bdae451512f6cf25f64ed37f5b87c", + "sha256:acad2d8b20a1af07d4e4c9d2e9285c5ed9104354062f275f3fcd88dcef4f1326" + ], + "index": "pypi", + "version": "==2021.3" + }, + "pyyaml": { + "hashes": [ + "sha256:0283c35a6a9fbf047493e3a0ce8d79ef5030852c51e9d911a27badfde0605293", + "sha256:055d937d65826939cb044fc8c9b08889e8c743fdc6a32b33e2390f66013e449b", + "sha256:07751360502caac1c067a8132d150cf3d61339af5691fe9e87803040dbc5db57", + "sha256:0b4624f379dab24d3725ffde76559cff63d9ec94e1736b556dacdfebe5ab6d4b", + "sha256:0ce82d761c532fe4ec3f87fc45688bdd3a4c1dc5e0b4a19814b9009a29baefd4", + "sha256:1e4747bc279b4f613a09eb64bba2ba602d8a6664c6ce6396a4d0cd413a50ce07", + "sha256:213c60cd50106436cc818accf5baa1aba61c0189ff610f64f4a3e8c6726218ba", + "sha256:231710d57adfd809ef5d34183b8ed1eeae3f76459c18fb4a0b373ad56bedcdd9", + "sha256:277a0ef2981ca40581a47093e9e2d13b3f1fbbeffae064c1d21bfceba2030287", + "sha256:2cd5df3de48857ed0544b34e2d40e9fac445930039f3cfe4bcc592a1f836d513", + "sha256:40527857252b61eacd1d9af500c3337ba8deb8fc298940291486c465c8b46ec0", + "sha256:473f9edb243cb1935ab5a084eb238d842fb8f404ed2193a915d1784b5a6b5fc0", + "sha256:48c346915c114f5fdb3ead70312bd042a953a8ce5c7106d5bfb1a5254e47da92", + "sha256:50602afada6d6cbfad699b0c7bb50d5ccffa7e46a3d738092afddc1f9758427f", + "sha256:68fb519c14306fec9720a2a5b45bc9f0c8d1b9c72adf45c37baedfcd949c35a2", + "sha256:77f396e6ef4c73fdc33a9157446466f1cff553d979bd00ecb64385760c6babdc", + "sha256:819b3830a1543db06c4d4b865e70ded25be52a2e0631ccd2f6a47a2822f2fd7c", + "sha256:897b80890765f037df3403d22bab41627ca8811ae55e9a722fd0392850ec4d86", + "sha256:98c4d36e99714e55cfbaaee6dd5badbc9a1ec339ebfc3b1f52e293aee6bb71a4", + "sha256:9df7ed3b3d2e0ecfe09e14741b857df43adb5a3ddadc919a2d94fbdf78fea53c", + "sha256:9fa600030013c4de8165339db93d182b9431076eb98eb40ee068700c9c813e34", + "sha256:a80a78046a72361de73f8f395f1f1e49f956c6be882eed58505a15f3e430962b", + "sha256:b3d267842bf12586ba6c734f89d1f5b871df0273157918b0ccefa29deb05c21c", + "sha256:b5b9eccad747aabaaffbc6064800670f0c297e52c12754eb1d976c57e4f74dcb", + "sha256:c5687b8d43cf58545ade1fe3e055f70eac7a5a1a0bf42824308d868289a95737", + "sha256:cba8c411ef271aa037d7357a2bc8f9ee8b58b9965831d9e51baf703280dc73d3", + "sha256:d15a181d1ecd0d4270dc32edb46f7cb7733c7c508857278d3d378d14d606db2d", + "sha256:d4db7c7aef085872ef65a8fd7d6d09a14ae91f691dec3e87ee5ee0539d516f53", + "sha256:d4eccecf9adf6fbcc6861a38015c2a64f38b9d94838ac1810a9023a0609e1b78", + "sha256:d67d839ede4ed1b28a4e8909735fc992a923cdb84e618544973d7dfc71540803", + "sha256:daf496c58a8c52083df09b80c860005194014c3698698d1a57cbcfa182142a3a", + "sha256:e61ceaab6f49fb8bdfaa0f92c4b57bcfbea54c09277b1b4f7ac376bfb7a7c174", + "sha256:f84fbc98b019fef2ee9a1cb3ce93e3187a6df0b2538a651bfb890254ba9f90b5" + ], + "index": "pypi", + "version": "==6.0" + }, + "regex": { + "hashes": [ + "sha256:0416f7399e918c4b0e074a0f66e5191077ee2ca32a0f99d4c187a62beb47aa05", + "sha256:05b7d6d7e64efe309972adab77fc2af8907bb93217ec60aa9fe12a0dad35874f", + "sha256:0617383e2fe465732af4509e61648b77cbe3aee68b6ac8c0b6fe934db90be5cc", + "sha256:07856afef5ffcc052e7eccf3213317fbb94e4a5cd8177a2caa69c980657b3cb4", + "sha256:0f594b96fe2e0821d026365f72ac7b4f0b487487fb3d4aaf10dd9d97d88a9737", + 
"sha256:139a23d1f5d30db2cc6c7fd9c6d6497872a672db22c4ae1910be22d4f4b2068a", + "sha256:162abfd74e88001d20cb73ceaffbfe601469923e875caf9118333b1a4aaafdc4", + "sha256:2207ae4f64ad3af399e2d30dde66f0b36ae5c3129b52885f1bffc2f05ec505c8", + "sha256:2409b5c9cef7054dde93a9803156b411b677affc84fca69e908b1cb2c540025d", + "sha256:2fee3ed82a011184807d2127f1733b4f6b2ff6ec7151d83ef3477f3b96a13d03", + "sha256:30ab804ea73972049b7a2a5c62d97687d69b5a60a67adca07eb73a0ddbc9e29f", + "sha256:3598893bde43091ee5ca0a6ad20f08a0435e93a69255eeb5f81b85e81e329264", + "sha256:3b5df18db1fccd66de15aa59c41e4f853b5df7550723d26aa6cb7f40e5d9da5a", + "sha256:3c5fb32cc6077abad3bbf0323067636d93307c9fa93e072771cf9a64d1c0f3ef", + "sha256:416c5f1a188c91e3eb41e9c8787288e707f7d2ebe66e0a6563af280d9b68478f", + "sha256:42b50fa6666b0d50c30a990527127334d6b96dd969011e843e726a64011485da", + "sha256:432bd15d40ed835a51617521d60d0125867f7b88acf653e4ed994a1f8e4995dc", + "sha256:473e67837f786404570eae33c3b64a4b9635ae9f00145250851a1292f484c063", + "sha256:4aaa4e0705ef2b73dd8e36eeb4c868f80f8393f5f4d855e94025ce7ad8525f50", + "sha256:50a7ddf3d131dc5633dccdb51417e2d1910d25cbcf842115a3a5893509140a3a", + "sha256:529801a0d58809b60b3531ee804d3e3be4b412c94b5d267daa3de7fadef00f49", + "sha256:537ca6a3586931b16a85ac38c08cc48f10fc870a5b25e51794c74df843e9966d", + "sha256:53db2c6be8a2710b359bfd3d3aa17ba38f8aa72a82309a12ae99d3c0c3dcd74d", + "sha256:5537f71b6d646f7f5f340562ec4c77b6e1c915f8baae822ea0b7e46c1f09b733", + "sha256:563d5f9354e15e048465061509403f68424fef37d5add3064038c2511c8f5e00", + "sha256:5d408a642a5484b9b4d11dea15a489ea0928c7e410c7525cd892f4d04f2f617b", + "sha256:61600a7ca4bcf78a96a68a27c2ae9389763b5b94b63943d5158f2a377e09d29a", + "sha256:6650f16365f1924d6014d2ea770bde8555b4a39dc9576abb95e3cd1ff0263b36", + "sha256:666abff54e474d28ff42756d94544cdfd42e2ee97065857413b72e8a2d6a6345", + "sha256:68a067c11463de2a37157930d8b153005085e42bcb7ad9ca562d77ba7d1404e0", + "sha256:6e1d2cc79e8dae442b3fa4a26c5794428b98f81389af90623ffcc650ce9f6732", + "sha256:74cbeac0451f27d4f50e6e8a8f3a52ca074b5e2da9f7b505c4201a57a8ed6286", + "sha256:780b48456a0f0ba4d390e8b5f7c661fdd218934388cde1a974010a965e200e12", + "sha256:788aef3549f1924d5c38263104dae7395bf020a42776d5ec5ea2b0d3d85d6646", + "sha256:7ee1227cf08b6716c85504aebc49ac827eb88fcc6e51564f010f11a406c0a667", + "sha256:7f301b11b9d214f83ddaf689181051e7f48905568b0c7017c04c06dfd065e244", + "sha256:83ee89483672b11f8952b158640d0c0ff02dc43d9cb1b70c1564b49abe92ce29", + "sha256:85bfa6a5413be0ee6c5c4a663668a2cad2cbecdee367630d097d7823041bdeec", + "sha256:9345b6f7ee578bad8e475129ed40123d265464c4cfead6c261fd60fc9de00bcf", + "sha256:93a5051fcf5fad72de73b96f07d30bc29665697fb8ecdfbc474f3452c78adcf4", + "sha256:962b9a917dd7ceacbe5cd424556914cb0d636001e393b43dc886ba31d2a1e449", + "sha256:96fc32c16ea6d60d3ca7f63397bff5c75c5a562f7db6dec7d412f7c4d2e78ec0", + "sha256:98ba568e8ae26beb726aeea2273053c717641933836568c2a0278a84987b2a1a", + "sha256:a3feefd5e95871872673b08636f96b61ebef62971eab044f5124fb4dea39919d", + "sha256:a955b747d620a50408b7fdf948e04359d6e762ff8a85f5775d907ceced715129", + "sha256:b43c2b8a330a490daaef5a47ab114935002b13b3f9dc5da56d5322ff218eeadb", + "sha256:b483c9d00a565633c87abd0aaf27eb5016de23fed952e054ecc19ce32f6a9e7e", + "sha256:b9ed0b1e5e0759d6b7f8e2f143894b2a7f3edd313f38cf44e1e15d360e11749b", + "sha256:ba05430e819e58544e840a68b03b28b6d328aff2e41579037e8bab7653b37d83", + "sha256:ca49e1ab99593438b204e00f3970e7a5f70d045267051dfa6b5f4304fcfa1dbf", + "sha256:ca5f18a75e1256ce07494e245cdb146f5a9267d3c702ebf9b65c7f8bd843431e", + 
"sha256:cd410a1cbb2d297c67d8521759ab2ee3f1d66206d2e4328502a487589a2cb21b", + "sha256:ce298e3d0c65bd03fa65ffcc6db0e2b578e8f626d468db64fdf8457731052942", + "sha256:d5ca078bb666c4a9d1287a379fe617a6dccd18c3e8a7e6c7e1eb8974330c626a", + "sha256:d5fd67df77bab0d3f4ea1d7afca9ef15c2ee35dfb348c7b57ffb9782a6e4db6e", + "sha256:da1a90c1ddb7531b1d5ff1e171b4ee61f6345119be7351104b67ff413843fe94", + "sha256:dba70f30fd81f8ce6d32ddeef37d91c8948e5d5a4c63242d16a2b2df8143aafc", + "sha256:dc07f021ee80510f3cd3af2cad5b6a3b3a10b057521d9e6aaeb621730d320c5a", + "sha256:dd33eb9bdcfbabab3459c9ee651d94c842bc8a05fabc95edf4ee0c15a072495e", + "sha256:e0538c43565ee6e703d3a7c3bdfe4037a5209250e8502c98f20fea6f5fdf2965", + "sha256:e1f54b9b4b6c53369f40028d2dd07a8c374583417ee6ec0ea304e710a20f80a0", + "sha256:e32d2a2b02ccbef10145df9135751abea1f9f076e67a4e261b05f24b94219e36", + "sha256:e6096b0688e6e14af6a1b10eaad86b4ff17935c49aa774eac7c95a57a4e8c296", + "sha256:e71255ba42567d34a13c03968736c5d39bb4a97ce98188fafb27ce981115beec", + "sha256:ed2e07c6a26ed4bea91b897ee2b0835c21716d9a469a96c3e878dc5f8c55bb23", + "sha256:eef2afb0fd1747f33f1ee3e209bce1ed582d1896b240ccc5e2697e3275f037c7", + "sha256:f23222527b307970e383433daec128d769ff778d9b29343fb3496472dc20dabe", + "sha256:f341ee2df0999bfdf7a95e448075effe0db212a59387de1a70690e4acb03d4c6", + "sha256:f5be7805e53dafe94d295399cfbe5227f39995a997f4fd8539bf3cbdc8f47ca8", + "sha256:f7f325be2804246a75a4f45c72d4ce80d2443ab815063cdf70ee8fb2ca59ee1b", + "sha256:f8af619e3be812a2059b212064ea7a640aff0568d972cd1b9e920837469eb3cb", + "sha256:fa8c626d6441e2d04b6ee703ef2d1e17608ad44c7cb75258c09dd42bacdfc64b", + "sha256:fbb9dc00e39f3e6c0ef48edee202f9520dafb233e8b51b06b8428cfcb92abd30", + "sha256:fff55f3ce50a3ff63ec8e2a8d3dd924f1941b250b0aac3d3d42b687eeff07a8e" + ], + "version": "==2021.11.10" + }, + "requests": { + "hashes": [ + "sha256:6c1246513ecd5ecd4528a0906f910e8f0f9c6b8ec72030dc9fd154dc1a6efd24", + "sha256:b8aa58f8cf793ffd8782d3d8cb19e66ef36f7aba4353eec859e74678b01b07a7" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5'", + "version": "==2.26.0" + }, + "six": { + "hashes": [ + "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926", + "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'", + "version": "==1.16.0" + }, + "snowballstemmer": { + "hashes": [ + "sha256:09b16deb8547d3412ad7b590689584cd0fe25ec8db3be37788be3810cbf19cb1", + "sha256:c8e1716e83cc398ae16824e5572ae04e0d9fc2c6b985fb0f900f5f0c96ecba1a" + ], + "version": "==2.2.0" + }, + "sphinx": { + "hashes": [ + "sha256:048dac56039a5713f47a554589dc98a442b39226a2b9ed7f82797fcb2fe9253f", + "sha256:32a5b3e9a1b176cc25ed048557d4d3d01af635e6b76c5bc7a43b0a34447fbd45" + ], + "markers": "python_version >= '3.6'", + "version": "==4.3.1" + }, + "sphinxcontrib-applehelp": { + "hashes": [ + "sha256:806111e5e962be97c29ec4c1e7fe277bfd19e9652fb1a4392105b43e01af885a", + "sha256:a072735ec80e7675e3f432fcae8610ecf509c5f1869d17e2eecff44389cdbc58" + ], + "markers": "python_version >= '3.5'", + "version": "==1.0.2" + }, + "sphinxcontrib-devhelp": { + "hashes": [ + "sha256:8165223f9a335cc1af7ffe1ed31d2871f325254c0423bc0c4c7cd1c1e4734a2e", + "sha256:ff7f1afa7b9642e7060379360a67e9c41e8f3121f2ce9164266f61b9f4b338e4" + ], + "markers": "python_version >= '3.5'", + "version": "==1.0.2" + }, + "sphinxcontrib-htmlhelp": { + "hashes": [ + 
"sha256:d412243dfb797ae3ec2b59eca0e52dac12e75a241bf0e4eb861e450d06c6ed07", + "sha256:f5f8bb2d0d629f398bf47d0d69c07bc13b65f75a81ad9e2f71a63d4b7a2f6db2" + ], + "markers": "python_version >= '3.6'", + "version": "==2.0.0" + }, + "sphinxcontrib-jsmath": { + "hashes": [ + "sha256:2ec2eaebfb78f3f2078e73666b1415417a116cc848b72e5172e596c871103178", + "sha256:a9925e4a4587247ed2191a22df5f6970656cb8ca2bd6284309578f2153e0c4b8" + ], + "markers": "python_version >= '3.5'", + "version": "==1.0.1" + }, + "sphinxcontrib-qthelp": { + "hashes": [ + "sha256:4c33767ee058b70dba89a6fc5c1892c0d57a54be67ddd3e7875a18d14cba5a72", + "sha256:bd9fc24bcb748a8d51fd4ecaade681350aa63009a347a8c14e637895444dfab6" + ], + "markers": "python_version >= '3.5'", + "version": "==1.0.3" + }, + "sphinxcontrib-serializinghtml": { + "hashes": [ + "sha256:352a9a00ae864471d3a7ead8d7d79f5fc0b57e8b3f95e9867eb9eb28999b92fd", + "sha256:aa5f6de5dfdf809ef505c4895e51ef5c9eac17d0f287933eb49ec495280b6952" + ], + "markers": "python_version >= '3.5'", + "version": "==1.1.5" + }, + "tabulate": { + "hashes": [ + "sha256:d7c013fe7abbc5e491394e10fa845f8f32fe54f8dc60c6622c6cf482d25d47e4", + "sha256:eb1d13f25760052e8931f2ef80aaf6045a6cceb47514db8beab24cded16f13a7" + ], + "version": "==0.8.9" + }, + "toml": { + "hashes": [ + "sha256:806143ae5bfb6a3c6e736a764057db0e6a0e05e338b5630894a5f779cabb4f9b", + "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f" + ], + "markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'", + "version": "==0.10.2" + }, + "tomli": { + "hashes": [ + "sha256:c6ce0015eb38820eaf32b5db832dbc26deb3dd427bd5f6556cf0acac2c214fee", + "sha256:f04066f68f5554911363063a30b108d2b5a5b1a010aa8b6132af78489fe3aade" + ], + "markers": "python_version >= '3.6'", + "version": "==1.2.2" + }, + "types-pytz": { + "hashes": [ + "sha256:d58a0688094b768d8e21c044e45861cbcaecba0494fd5b9c5feb3e1739211606", + "sha256:dffd77f3efecd3b1555f187a9bf3a638d55fac296700b829c41bd51ec72a6eb7" + ], + "index": "pypi", + "version": "==2021.3.1" + }, + "types-pyyaml": { + "hashes": [ + "sha256:2e27b0118ca4248a646101c5c318dc02e4ca2866d6bc42e84045dbb851555a76", + "sha256:d5b318269652e809b5c30a5fe666c50159ab80bfd41cd6bafe655bf20b29fcba" + ], + "index": "pypi", + "version": "==6.0.1" + }, + "typing-extensions": { + "hashes": [ + "sha256:2cdf80e4e04866a9b3689a51869016d36db0814d84b8d8a568d22781d45d27ed", + "sha256:829704698b22e13ec9eaf959122315eabb370b0884400e9818334d8b677023d9" + ], + "markers": "python_version >= '3.6'", + "version": "==4.0.0" + }, + "typing-inspect": { + "hashes": [ + "sha256:047d4097d9b17f46531bf6f014356111a1b6fb821a24fe7ac909853ca2a782aa", + "sha256:3cd7d4563e997719a710a3bfe7ffb544c6b72069b6812a02e9b414a8fa3aaa6b", + "sha256:b1f56c0783ef0f25fb064a01be6e5407e54cf4a4bf4f3ba3fe51e0bd6dcea9e5" + ], + "version": "==0.7.1" + }, + "urllib3": { + "hashes": [ + "sha256:4987c65554f7a2dbf30c18fd48778ef124af6fab771a377103da0585e2336ece", + "sha256:c4fdf4019605b6e5423637e01bc9fe4daef873709a7973e195ceba0a62bbc844" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4'", + "version": "==1.26.7" + }, + "virtualenv": { + "hashes": [ + "sha256:4b02e52a624336eece99c96e3ab7111f469c24ba226a53ec474e8e787b365814", + "sha256:576d05b46eace16a9c348085f7d0dc8ef28713a2cabaa1cf0aea41e8f12c9218" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", + "version": "==20.10.0" + } + } +} diff --git a/README.md b/README.md new file mode 
100644 index 0000000..f1a0a0c --- /dev/null +++ b/README.md @@ -0,0 +1,13 @@ +atextcrawler is an asynchronous webcrawler indexing text for literal and semantic search. + +Its client-side counterpart is [atextsearch](https://gitea.multiname.org/a-text/atextsearch) + +atextcrawler crawls and indexes selected websites. +It starts from a few seed sites and follows their external links. +Criteria defined in plugin code determine which linked sites (and +which of their resources) are (recursively) added to the pool. + +atextcrawler is written in Python, runs a configurable number of +async workers concurrently (in one process), uses tensorflow for +embedding (paragraph-sized) text chunks in a (multi-)language model +and stores metadata in PostgreSQL and texts in elasticsearch. diff --git a/doc/Makefile b/doc/Makefile new file mode 100644 index 0000000..d0c3cbf --- /dev/null +++ b/doc/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = source +BUILDDIR = build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/doc/source/conf.py b/doc/source/conf.py new file mode 100644 index 0000000..db3c026 --- /dev/null +++ b/doc/source/conf.py @@ -0,0 +1,71 @@ +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +# import os +# import sys +# sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath('.')))) +import os +import sys + +proj_dir = os.path.dirname(os.path.dirname(os.path.abspath('.'))) +sys.path.insert(0, proj_dir + '/src') + + +# -- Project information ----------------------------------------------------- + +project = 'atextcrawler' +copyright = '2021, ibu radempa' +author = 'ibu radempa' + +# The full version, including alpha/beta/rc tags +release = '0.1.0' + + +# -- General configuration --------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + 'sphinx.ext.autodoc', + 'sphinx.ext.autosummary', + 'myst_parser', + 'sphinx.ext.graphviz', +] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. 
+exclude_patterns = []
+
+
+# -- Options for HTML output -------------------------------------------------
+
+# The theme to use for HTML and HTML Help pages.  See the documentation for
+# a list of builtin themes.
+#
+html_theme = 'sphinx_rtd_theme'
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ['_static']
+
+autosummary_generate = True
+
+source_suffix = {
+    '.rst': 'restructuredtext',
+    '.md': 'markdown',
+}
diff --git a/doc/source/config_template/initial_data/seed_urls.list b/doc/source/config_template/initial_data/seed_urls.list
new file mode 100644
index 0000000..7dc21ea
--- /dev/null
+++ b/doc/source/config_template/initial_data/seed_urls.list
@@ -0,0 +1,23 @@
+# Initial URLs (first run only)
+#
+# To whitelist a URL prepend '+', to blacklist prepend '-'.
+# Comment lines must begin with '#'.
+
+# de
++http://agd.blogsport.de/
++https://blackblogs.org/blogs/
++https://fau.org/
++http://anarchiv.de/
++http://olaf.bbm.de/die-aktion
+-https://www.anarchistischefoderation.de/
+
+# en
++https://anarchistarchivist.com/
++https://bookshelf.theanarchistlibrary.org/library/
++https://archive.elephanteditions.net/library/
++https://blackrosefed.org/
++https://alpineanarchist.org/
++https://nostate.net/
++https://abolishing.blackblogs.org/
++http://library.nothingness.org/
+-https://www.anarchistfederation.net/
diff --git a/doc/source/config_template/main.yaml b/doc/source/config_template/main.yaml
new file mode 100644
index 0000000..8a12feb
--- /dev/null
+++ b/doc/source/config_template/main.yaml
@@ -0,0 +1,88 @@
+# Name of this instance
+# Default value: atextcrawler
+# Allowed values: arbitrary string
+instance_name: atextcrawler
+
+# Which kind of instance is this?
+# Default value: prod
+# Allowed values are:
+# - 'dev': development instance
+# - 'staging': staging instance
+# - 'prod': production instance
+instance_type: prod
+
+# Log level
+# Default value: info
+# Allowed values: critical, error, warning, info, debug
+log_level: info
+
+# Plugins directory
+# If given as a relative path, it will be relative to the
+# directory of this file (main.yaml).
+# Read the documentation on plugins.
+# Default value: plugins
+# Hint: Create an empty __init__.py in the plugins_dir.
+plugins_dir: plugins
+
+# Parameters for access to the PostgreSQL service
+# No default values; must be set.
+postgresql:
+  host: localhost
+  port: 5432
+  database: atextcrawler
+  user: atextcrawler
+  password: ________________________
+
+# Crawling
+crawl:
+  # Number of concurrent workers
+  # Default value: 10
+  # Allowed values: integer >=0 and <=1000
+  #workers: 3
+
+  # Delay in seconds between attempts to fetch items
+  # from site_queue if the last attempt gave no item;
+  # also the delay in seconds after a worker has found
+  # no site to process
+  # Default value: 600
+  # Allowed values: positive number
+  #site_delay: 10
+
+  # Time interval in seconds between site updates when
+  # handling queued base URLs
+  # Default value: 3600
+  # Allowed values: positive number
+  #site_revisit_interval: 3600
+
+  # Delay in seconds between attempts to process
+  # individual resources (pages etc.)
of a site + # Default value: 5 + # Allowed values: positive number + #resource_delay: 3 + + # Default interval in seconds between full crawls of a site + # Default value: 864000 (10 days) + # Allowed values: positive number + #full_crawl_interval: 864000 + + # Default interval in seconds between feed crawls of a site + # Default value: 86400 (1 day) + # Allowed values: positive number + #feed_crawl_interval: 86400 + +# Parameters for access to the ElasticSearch service +# No default values; must be set. +elasticsearch: + # host on which ES is running + host: localhost + # API key for accessing ES + api_key: "**********************" + # API user id + id: "**********************" + # Index base name (actual index names will have '_text' etc. appended) + index_base_name: atext + +# Tensorflow access +tensorflow: + # The prediction endpoint of the model server's sentence model + model_server_endpoint: http://localhost:9000/v1/models/sentences:predict diff --git a/doc/source/config_template/plugins/__init__.py b/doc/source/config_template/plugins/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/doc/source/config_template/plugins/filter_resource_path.py b/doc/source/config_template/plugins/filter_resource_path.py new file mode 100644 index 0000000..cb84e22 --- /dev/null +++ b/doc/source/config_template/plugins/filter_resource_path.py @@ -0,0 +1,22 @@ +""" +Filter paths found in a resource. + +This plugin implements :func:`rp_filter`. +""" + +from typing import Optional + + +def rp_filter(site, durl) -> Optional[str]: + """ + Adjust or filter found paths (may depend on site). + + To filter out a path (i.e., not add it to table `site_path`) + return None. + """ + path = durl.pwa() + # skip fetching images (linked from a tags; img tags are skipped anyway) + if path.lower().endswith('.jpg') or path.lower().endswith('.png'): + return None + path = path.removesuffix('?amp=1') + return path diff --git a/doc/source/config_template/plugins/filter_site.py b/doc/source/config_template/plugins/filter_site.py new file mode 100644 index 0000000..09b2282 --- /dev/null +++ b/doc/source/config_template/plugins/filter_site.py @@ -0,0 +1,47 @@ +""" +Relevance estimation of sites. + +This plugin implements :func:`site_filter`. +""" + +import re + +from atextcrawler.models import Site + +MIN_RELEVANCE_SCORE = 5 + + +async def site_filter(site: Site) -> bool: + """ + Assess relevance of the site (using language-dependent criteria). + + If the site shall be crawled, return True, else False. + """ + # limit to sites in English or German language + if not set(['de', 'en']) & set(site.langs): + return False + score = 0.0 + for crit_name, weight, langs, crit_re in re_criteria: + if '*' in langs or set(langs) & set(site.langs): + findings = crit_re.findall(site.startpage_text) + if findings: + score += weight * len(findings) + if site.title and crit_re.search(site.title): + score += 4 * weight + if site.description and crit_re.search(site.description): + score += 4 * weight + + # TODO: add criteria for named entities (FdA-IFA, FAU, ...) + + return score >= MIN_RELEVANCE_SCORE + + +re_criteria = { + ( + 'anarch', + 1.0, + ('*',), + re.compile('((? bool: + """ + Per-site path filter. Return whether the path shall be retrieved. + """ + if not robots.can_fetch_url(site.base_url + path): + return False + if 'amusewiki' in site.meta_info.get('generator', '').lower(): + if any( + [ + path.endswith(end) + for end in ('.html', '.epub', '.tex', '.zip', '.pdf') + ] + ): + return False + if '/bbselect?' 
in path:
+        return False
+    return True
diff --git a/doc/source/devel/devel.md b/doc/source/devel/devel.md
new file mode 100644
index 0000000..18ce86b
--- /dev/null
+++ b/doc/source/devel/devel.md
@@ -0,0 +1,63 @@
+## Setup dev environment
+1. You need python 3.9 or later.
+1. Have pipenv installed, e.g.: install pip3 (`apt install python3-pip`), then run `pip3 install --user pipenv`.
+1. Clone the repo and set up a virtualenv:
+```
+cd YOUR_DEV_DIR
+git clone ssh://gitea@gitea-ssh.multiname.org:20106/a-text/atextcrawler.git
+cd atextcrawler
+pipenv install -d
+```
+
+## Configure the instance
+See [installation](installation.md).
+
+## Run
+```
+python -m atextcrawler
+```
+
+## Logging
+Use the configured instance_name (e.g. `atextcrawler_dev`) to select journal messages:
+```
+journalctl -ef SYSLOG_IDENTIFIER=atextcrawler_dev
+```
+
+## Upgrading
+Upgrade dev tools:
+```
+pre-commit autoupdate
+```
+
+## Test and clean manually
+```
+AIOPGQ_POSTGRESQL="host=127.0.0.1 port=5432 database=atextcrawler-dev user=atextcrawler-dev password=*************" python -W ignore -m unittest discover
+mypy --ignore-missing-imports src/atextcrawler
+isort src/atextcrawler
+black -S -t py37 -l 79 src/atextcrawler
+pybetter --exclude B004,B007,B008 src/atextcrawler
+interrogate -i -I -m -v src/atextcrawler
+```
+
+## Release
+There are no releases (currently).
+
+## Useful commands
+
+### Fetch a resource or a site manually
+```
+python -m atextcrawler.resource https://www.katesharpleylibrary.net/
+python -m atextcrawler.site https://www.katesharpleylibrary.net/
+```
+
+### SQL
+```
+drop table crawl; drop table site_path; drop table resource; drop table site cascade; drop table site_feed; drop table site_link; drop table site_queue; drop table kvs;
+
+http -j --auth elastic:*********************** -j DELETE http://127.0.0.1:9200/anarchism_text_*
+
+http -j --auth elastic:*********************** -j GET http://127.0.0.1:9200/_cat/indices
+
+-- stats: sites, paths, resources
+select s.id site_id, s.base_url, spr.n_paths, spr.n_resources, spr.n_chars from site s left join (select sp.site_id, count(sp.path) n_paths, count(r.id) n_resources, sum(r.text_len) n_chars from site_path sp left join resource r on sp.resource_id=r.id group by sp.site_id) spr on spr.site_id=s.id where s.relevant order by s.id;
+```
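+
+The SQL statements above can also be run non-interactively with psql.
+A suggested invocation (not part of this setup so far), assuming the dev
+credentials from the test command above; adjust host, port and names:
+```
+# run a single statement against the dev database
+PGPASSWORD=************* psql -h 127.0.0.1 -p 5432 -U atextcrawler-dev -d atextcrawler-dev -c 'select count(*) from site;'
+```
diff --git a/doc/source/devel/related_work.md b/doc/source/devel/related_work.md
new file mode 100644
index 0000000..5ecb99d
--- /dev/null
+++ b/doc/source/devel/related_work.md
@@ -0,0 +1,64 @@
+## Related work
+* [collection of crawlers](https://github.com/adbar/awesome-crawler)
+* [collection of webscrapers](https://github.com/adbar/awesome-web-scraper)
+
+### crawlers
+* [acrawler](https://acrawler.readthedocs.io/en/latest/)
+* [trafilatura](https://trafilatura.readthedocs.io/en/latest/index.html)
+  * [repo](https://github.com/adbar/trafilatura)
+  * [intro](https://adrien.barbaresi.eu/blog/trafilatura-main-text-content-python.html)
+* [aiohttp_spider](https://github.com/niklak/aiohttp_spider/)
+* [scrapy](https://docs.scrapy.org/en/latest/)
+* [heritrix3](https://github.com/internetarchive/heritrix3/)
+* [YaCy](https://yacy.net/)
+* [searchmysite](https://searchmysite.net/)
+* [spiderling](http://corpus.tools/raw-attachment/wiki/Downloads/spiderling-src-0.84.tar.xz)
+* https://github.com/riteshnaik/Crawling-and-Deduplication-of-Polar-Datasets-Using-Nutch-and-Tika
+* [edge search 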
engine](https://memex.marginalia.nu/projects/edge/about.gmi) + +#### general +* [elastic enterprise search](https://www.elastic.co/blog/building-a-scalable-easy-to-use-web-crawler-for-elastic-enterprise-search) + +### sitemap parsers +* [ultimate-sitemap-parser](https://github.com/mediacloud/ultimate-sitemap-parser) + +### url handling +* [courlan](https://pypi.org/project/courlan/) + +### language detection +* [overview](https://stackoverflow.com/questions/39142778/python-how-to-determine-the-language) +* [guess_language-spirit](https://pypi.org/project/guess_language-spirit/) +* [guess_language](https://pypi.org/project/guess-language/) +* [cld3](https://github.com/google/cld3) + +### text extraction +* [JusText](http://corpus.tools/wiki/Justext_changelog) [demo](https://nlp.fi.muni.cz/projects/justext/) + +### deduplication +* [PostgreSQL extension smlar](https://github.com/jirutka/smlar) +* [use smlar](https://medium.datadriveninvestor.com/the-smlar-plug-in-for-effective-retrieval-of-massive-volumes-of-simhash-data-e429c19da1a3) +* remove paragraphs with more than 50% word-7-tuples encountered previously + +### Extract more meta tags +* https://github.com/shareaholic/shareaholic-api-docs/blob/master/shareaholic_meta_tags.md + https://support.shareaholic.com/hc/en-us/articles/115003085186 + +### Date parsing dependent on language +* https://en.wikipedia.org/wiki/Date_format_by_country +* https://en.wikipedia.org/wiki/Common_Locale_Data_Repository +* https://pypi.org/project/dateparser/ +* https://github.com/ovalhub/pyicu +* https://github.com/night-crawler/cldr-language-helpers +* https://stackoverflow.com/questions/19927654/using-dateutil-parser-to-parse-a-date-in-another-language + +ICU +* https://unicode-org.github.io/icu/userguide/format_parse/datetime/examples.html#parse +* https://gist.github.com/dpk/8325992 +* https://unicode-org.github.io/icu-docs/apidoc/released/icu4c/classicu_1_1DateFormat.html +* https://unicode-org.github.io/icu/userguide/ +* https://unicode-org.github.io/icu-docs/#/icu4c/ +* https://github.com/ovalhub/pyicu/blob/master/samples/break.py +* https://www.unicode.org/reports/tr35/tr35-dates.html#Date_Field_Symbol_Table +* https://www.unicode.org/reports/tr35/tr35-dates.html#months_days_quarters_eras +* https://unicode-org.github.io/icu/userguide/format_parse/datetime/#formatting-dates-and-times-overview diff --git a/doc/source/devel/todo.md b/doc/source/devel/todo.md new file mode 100644 index 0000000..35e8065 --- /dev/null +++ b/doc/source/devel/todo.md @@ -0,0 +1,77 @@ +## TODO + +* parse html time tags + +* site annotations: + * categories + * historical (no changes any more since n months) + * news + * local focus - geonames: http://download.geonames.org/export/dump/cities15000.zip + +* allow for tls in elasticsearch config + +* replace dashes, dots and quotes: https://github.com/kovidgoyal/calibre/blob/3dd95981398777f3c958e733209f3583e783b98c/src/calibre/utils/unsmarten.py +``` + '–': '--', + '–': '--', + '–': '--', + '—': '---', + '—': '---', + '—': '---', + '…': '...', + '…': '...', + '…': '...', + '“': '"', + '”': '"', + '„': '"', + '″': '"', + '“': '"', + '”': '"', + '„': '"', + '″': '"', + '“':'"', + '”':'"', + '„':'"', + '″':'"', + '‘':"'", + '’':"'", + '′':"'", + '‘':"'", + '’':"'", + '′':"'", + '‘':"'", + '’':"'", + '′':"'", +``` +* normalize quotation marks and punctuation in general + * https://unicode-table.com/en/sets/quotation-marks/ + * https://github.com/avian2/unidecode/blob/master/unidecode/x020.py + * 
https://www.fileformat.info/info/unicode/category/Po/list.htm
+  https://www.gaijin.at/en/infos/unicode-character-table-punctuation
+  ⁝
+
+* cancel crawls that take too long
+
+* search for "TODO" in code
+
+* feedparser has support for JSON feeds since commit
+  a5939702b1fd0ec75d2b586255ff0e29e5a8a6fc
+  (as of 2020-10-26 in "develop" branch, not part of a release)
+  the version names are 'json1' and 'json11'
+
+* allow site URLs with path, e.g.
+  https://web.archive.org/web/20090320055457/http://www.geocities.com/kk_abacus/
+
+* add more languages
+
+## Ideas
+* use [python-libzim](https://github.com/openzim/python-libzim) to create ZIM archives
+
+* [spacy-langdetect](https://pypi.org/project/spacy-langdetect/)
+* [langid.py](https://github.com/saffsd/langid.py)
+
+* [gain](https://github.com/gaojiuli/gain)
+* [ruia](https://docs.python-ruia.org/)
+* [demiurge](https://demiurge.readthedocs.io/)
+* [cocrawler](https://github.com/cocrawler/cocrawler/)
+* [aiocrawler](https://github.com/tapanpandita/aiocrawler/)
diff --git a/doc/source/development.rst b/doc/source/development.rst
new file mode 100644
index 0000000..befa427
--- /dev/null
+++ b/doc/source/development.rst
@@ -0,0 +1,9 @@
+Development
+-----------
+
+.. toctree::
+   :maxdepth: 2
+
+   devel/devel
+   devel/todo
+   devel/related_work
diff --git a/doc/source/elasticsearch.md b/doc/source/elasticsearch.md
new file mode 100644
index 0000000..7ccae9d
--- /dev/null
+++ b/doc/source/elasticsearch.md
@@ -0,0 +1,119 @@
+# Howto elasticsearch
+
+## Prerequisites
+On the host (virtualization host) we need:
+```
+# cat /etc/sysctl.d/virtual_memory.conf
+vm.max_map_count=262144
+# sysctl -p /etc/sysctl.d/virtual_memory.conf
+```
+
+If this cannot be done, change this file after installing or upgrading elasticsearch:
+```
+/usr/lib/sysctl.d/elasticsearch.conf
+```
+
+## Setup
+
+### Install package
+In general, see the [elasticsearch reference](https://www.elastic.co/guide/en/elasticsearch/reference/7.10/deb.html).
+
+We do a manual install. If you configure the apt repo instead, also think about setting
+`RESTART_ON_UPGRADE=true` in `/etc/default/elasticsearch`.
+
+```
+wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.15.2-amd64.deb
+wget https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-7.15.2-amd64.deb.sha512
+shasum -a 512 -c elasticsearch-7.15.2-amd64.deb.sha512
+dpkg -i elasticsearch-7.15.2-amd64.deb
+systemctl daemon-reload
+systemctl enable elasticsearch.service
+systemctl start elasticsearch.service
+```
+
+First test:
+```
+http -j GET 127.0.0.1:9200/
+```
+
+### Storage
+
+```
+systemctl stop elasticsearch.service
+mv /var/lib/elasticsearch/ /srv/
+systemctl start elasticsearch.service
+```
+
+Edit /etc/elasticsearch/elasticsearch.yml
+```
+cluster.name: org.a-text.search
+node.name: atext1
+path.data: /srv/elasticsearch
+path.logs: /var/log/elasticsearch
+discovery.seed_hosts: ["atext1.multiname.org"]
+xpack.security.enabled: true
+xpack.security.authc.api_key.enabled: true
+```
+
+```
+systemctl restart elasticsearch
+```
+
+The logfile is now at
+```
+/var/log/elasticsearch/org.a-text.search.log
+```
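+
+Once passwords are set up (see the next section), an API key for
+atextcrawler can be created. A possible way (not part of this setup so
+far; the `id` and `api_key` from the response belong in the
+`elasticsearch` section of main.yaml):
+```
+http --auth elastic:************** -j POST http://127.0.0.1:9200/_security/api_key name=atextcrawler
+```
+
+### Setup passwords
+Set up passwords:
+```
+# /usr/share/elasticsearch/bin/elasticsearch-setup-passwords auto
+Initiating the setup of passwords for reserved users elastic,apm_system,kibana,kibana_system,logstash_system,beats_system,remote_monitoring_user.
+The passwords will be randomly generated and printed to the console. 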
+Please confirm that you would like to continue [y/N]y +``` + +Copy output to /etc/elasticsearch/passwords and +``` +chmod 400 /etc/elasticsearch/passwords +``` + +Check login as user elastic: +``` +http --auth elastic:************** -j GET http://127.0.0.1:9200/ +``` + +### Memory limitation +To limit memory usage +``` +mkdir /etc/systemd/system/elasticsearch.service.d +cat >/etc/systemd/system/elasticsearch.service.d/override.conf <>.bashrc <>.profile < 0 and self.running: + await asyncio.sleep(min(t_slice, remaining)) + remaining -= t_slice + + +async def reset_site_locks(pool): + """ + Remove locks leftover from last run: Set crawl_active=false for all sites. + + This is relevant when the application was not shutdown properly (e.g. + when the process was killed). + """ + async with pool.acquire() as conn: + sql = "UPDATE site SET crawl_active = false WHERE crawl_active = true" + await conn.execute(sql) diff --git a/src/atextcrawler/assets/iana_langs b/src/atextcrawler/assets/iana_langs new file mode 100644 index 0000000..33687b8 --- /dev/null +++ b/src/atextcrawler/assets/iana_langs @@ -0,0 +1,7 @@ +The recommended language tags to use in webpages are from +the IANA Language Subtag Registry (BCP47), see: +https://www.w3.org/International/questions/qa-html-language-declarations +https://r12a.github.io/app-subtags/ + + +wget -O- https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry | rg '^Subtag: |^Tag: ' atextcrawler/assets/iana_langs_ | sed -e 's/^Subtag: //' | sed -e 's/^Tag: //' diff --git a/src/atextcrawler/assets/iso_639-1 b/src/atextcrawler/assets/iso_639-1 new file mode 100644 index 0000000..373b69a --- /dev/null +++ b/src/atextcrawler/assets/iso_639-1 @@ -0,0 +1,219 @@ +aa +ab +ae +af +ak +am +an +ar +as +av +ay +az +ba +be +bg +bh +bi +bm +bn +bo +br +bs +ca +ca +ce +ch +co +cr +cs +cu +cu +cu +cu +cu +cv +cy +da +de +dv +dv +dv +dz +ee +el +en +eo +es +es +et +eu +fa +ff +fi +fj +fo +fr +fy +ga +gd +gd +gl +gn +gu +gv +ha +he +hi +ho +hr +ht +ht +hu +hy +hz +ia +id +ie +ie +ig +ii +ii +ik +io +is +it +iu +ja +jv +ka +kg +ki +ki +kj +kj +kk +kl +kl +km +kn +ko +kr +ks +ku +kv +kw +ky +ky +la +lb +lb +lg +li +li +li +ln +lo +lt +lu +lv +mg +mh +mi +mk +ml +mn +mr +ms +mt +my +na +nb +nb +nd +nd +ne +ng +nl +nl +nn +nn +no +nr +nr +nv +nv +ny +ny +ny +oc +oj +om +or +os +os +pa +pa +pi +pl +ps +ps +pt +qu +rm +rn +ro +ro +ro +ru +rw +sa +sc +sd +se +sg +si +si +sk +sl +sm +sn +so +sq +sr +ss +st +su +sv +sw +ta +te +tg +th +ti +tk +tl +tn +to +tr +ts +tt +tw +ty +ug +ug +uk +ur +uz +ve +vi +vo +wa +wo +xh +yi +yo +za +za +zh +zu diff --git a/src/atextcrawler/assets/top_1e4 b/src/atextcrawler/assets/top_1e4 new file mode 100644 index 0000000..2c7dbf0 --- /dev/null +++ b/src/atextcrawler/assets/top_1e4 @@ -0,0 +1,10000 @@ +google.com +facebook.com +youtube.com +microsoft.com +twitter.com +tmall.com +instagram.com +windowsupdate.com +qq.com +linkedin.com +baidu.com +apple.com +wikipedia.org +netflix.com +live.com +sohu.com +doubleclick.net +amazon.com +yahoo.com +googletagmanager.com +taobao.com +youtu.be +adobe.com +pinterest.com +360.cn +vimeo.com +bing.com +reddit.com +jd.com +wordpress.com +office.com +weibo.com +zoom.us +googleusercontent.com +goo.gl +github.com +sina.com.cn +amazonaws.com +bit.ly +microsoftonline.com +xinhuanet.com +wordpress.org +google-analytics.com +blogspot.com +vk.com +fbcdn.net +tumblr.com +mozilla.org +msn.com +nytimes.com +whatsapp.com +flickr.com +europa.eu +gravatar.com +dropbox.com +cnn.com +ytimg.com +alipay.com 
+okezone.com +myshopify.com +soundcloud.com +nih.gov +panda.tv +medium.com +t.co +apache.org +skype.com +ebay.com +csdn.net +zhanqi.tv +w3.org +twitch.tv +yahoo.co.jp +spotify.com +forbes.com +theguardian.com +paypal.com +googlevideo.com +google.com.hk +office365.com +imdb.com +sourceforge.net +bbc.co.uk +aliexpress.com +googleadservices.com +macromedia.com +googlesyndication.com +archive.org +bongacams.com +naver.com +bbc.com +github.io +digicert.com +cloudflare.com +stackoverflow.com +weebly.com +yandex.ru +gvt2.com +creativecommons.org +amazon.co.jp +wixsite.com +amazon.in +issuu.com +washingtonpost.com +who.int +virginmedia.com +windows.net +tianya.cn +imgur.com +icloud.com +huanqiu.com +app-measurement.com +ggpht.com +aparat.com +etsy.com +yy.com +oracle.com +cdc.gov +akadns.net +reuters.com +google.co.in +mail.ru +chaturbate.com +163.com +php.net +google.de +wix.com +tinyurl.com +slideshare.net +tribunnews.com +so.com +godaddy.com +amazon.co.uk +akamaiedge.net +wikimedia.org +wsj.com +windows.com +businessinsider.com +bloomberg.com +forms.gle +youtube-nocookie.com +17ok.com +sciencedirect.com +opera.com +alibaba.com +cnet.com +ok.ru +outlook.com +wp.com +harvard.edu +google.co.jp +mit.edu +gnu.org +dailymail.co.uk +1688.com +opendns.com +google.com.br +researchgate.net +pornhub.com +ibm.com +go.com +usatoday.com +list-manage.com +ntp.org +blogger.com +fandom.com +t.me +stanford.edu +cnbc.com +wiley.com +rakuten.co.jp +hicloud.com +livejasmin.com +indeed.com +samsung.com +espn.com +office.net +aol.com +bitly.com +google.co.uk +hp.com +telegraph.co.uk +facebook.net +amazon.de +canva.com +cnblogs.com +aaplimg.com +fb.com +sogou.com +hao123.com +walmart.com +surveymonkey.com +nature.com +tiktok.com +booking.com +npr.org +nasa.gov +cpanel.net +identrust.com +foxnews.com +nginx.org +tradingview.com +eventbrite.com +dailymotion.com +un.org +cloudfront.net +time.com +haosou.com +indiatimes.com +msedge.net +babytree.com +xvideos.com +force.com +myspace.com +ca.gov +salesforce.com +slack.com +bilibili.com +aliyun.com +huffingtonpost.com +behance.net +addthis.com +www.gov.uk +udemy.com +springer.com +apple-dns.net +ted.com +google.fr +google.es +roblox.com +kompas.com +doi.org +cpanel.com +sharepoint.com +google.cn +wired.com +yelp.com +detik.com +chase.com +zillow.com +nginx.com +youku.com +mysql.com +google.it +zhihu.com +freepik.com +google.ru +hugedomains.com +scorecardresearch.com +instructure.com +independent.co.uk +akamaihd.net +flipkart.com +stackexchange.com +thestartmagazine.com +scribd.com +goodreads.com +themeforest.net +amazon-adsystem.com +akamaized.net +healthline.com +techcrunch.com +zendesk.com +gome.com.cn +mailchimp.com +debian.org +wa.me +squarespace.com +daum.net +latimes.com +gmail.com +free.fr +intel.com +telegram.org +unsplash.com +6.cn +shutterstock.com +berkeley.edu +adnxs.com +line.me +tripadvisor.com +grammarly.com +wetransfer.com +googletagservices.com +statcounter.com +livejournal.com +addtoany.com +wikihow.com +android.com +taboola.com +akamai.net +msftncsi.com +xhamster.com +gvt1.com +duckduckgo.com +giphy.com +amzn.to +savefrom.net +theverge.com +amazon.ca +netflix.net +ikea.com +google.ca +pixnet.net +ettoday.net +webmd.com +mediafire.com +intuit.com +speedtest.net +twimg.com +pixabay.com +sitemaps.org +kickstarter.com +ft.com +craigslist.org +cbsnews.com +irs.gov +beian.gov.cn +quora.com +whatsapp.net +google.com.sg +nbcnews.com +theatlantic.com +nationalgeographic.com +okta.com +loc.gov +usps.com +investopedia.com +cornell.edu +w3schools.com +ietf.org 
+marketwatch.com +arnebrachhold.de +cisco.com +washington.edu +digg.com +criteo.com +rednet.cn +shopify.com +zoho.com +deviantart.com +miit.gov.cn +bestbuy.com +adsrvr.org +rubiconproject.com +appsflyer.com +jimdo.com +webex.com +weather.com +digikala.com +padlet.com +wellsfargo.com +primevideo.com +about.com +google.com.tw +tiktokv.com +tandfonline.com +dell.com +nflxso.net +akismet.com +trello.com +onlinesbi.com +huffpost.com +pubmatic.com +buzzfeed.com +crashlytics.com +usnews.com +marriott.com +launchpad.net +fda.gov +azureedge.net +prnewswire.com +cambridge.org +discord.com +tistory.com +ups.com +disqus.com +princeton.edu +pikiran-rakyat.com +typepad.com +cnzz.com +uol.com.br +globo.com +stumbleupon.com +mailchi.mp +pinimg.com +ampproject.org +ilovepdf.com +avito.ru +2mdn.net +economist.com +hulu.com +demdex.net +mashable.com +statista.com +tiktokcdn.com +nypost.com +tokopedia.com +sciencemag.org +alicdn.com +academia.edu +msftconnecttest.com +bandcamp.com +coursera.org +whitehouse.gov +hubspot.com +change.org +youronlinechoices.com +nflximg.com +pbs.org +box.com +constantcontact.com +huawei.com +pki.goog +outbrain.com +rt.com +oup.com +target.com +51.la +patreon.com +soso.com +liputan6.com +feedburner.com +symantec.com +airbnb.com +youm7.com +ebay.de +advertising.com +fedex.com +google.com.vn +google.com.mx +trustpilot.com +aboutads.info +webs.com +rlcdn.com +steampowered.com +evernote.com +investing.com +casalemedia.com +homedepot.com +chess.com +openx.net +jquery.com +unesco.org +fc2.com +mozilla.com +sagepub.com +state.gov +disneyplus.com +amazon.fr +usda.gov +plesk.com +columbia.edu +grid.id +google.com.tr +varzesh3.com +eepurl.com +vice.com +arcgis.com +avast.com +umich.edu +hbr.org +moatads.com +teamviewer.com +britannica.com +mayoclinic.org +allaboutcookies.org +accuweather.com +cbc.ca +engadget.com +redhat.com +abc.net.au +hdfcbank.com +lazada.sg +gofundme.com +smallpdf.com +pexels.com +fiverr.com +hola.org +networkadvertising.org +business.site +dribbble.com +noaa.gov +51sole.com +psu.edu +tripod.com +vox.com +epa.gov +zdnet.com +geocities.com +bidswitch.net +setn.com +coinmarketcap.com +meetup.com +ebay.co.uk +google.co.th +yale.edu +bet9ja.com +amazon.es +sciencedaily.com +sun.com +worldometers.info +kumparan.com +metropoles.com +nist.gov +hootsuite.com +guardian.co.uk +heavy.com +getpocket.com +americanexpress.com +elsevier.com +example.com +newyorker.com +aliexpress.ru +wayfair.com +businesswire.com +dw.com +iqiyi.com +google.pl +upenn.edu +iso.org +breitbart.com +worldbank.org +sindonews.com +fidelity.com +gizmodo.com +apnews.com +nvidia.com +gotowebinar.com +cdninstagram.com +momoshop.com.tw +amazon.it +redd.it +fastcompany.com +typeform.com +psychologytoday.com +photobucket.com +wpengine.com +vkontakte.ru +merriam-webster.com +nike.com +ieee.org +telewebion.com +steamcommunity.com +byteoversea.com +edgekey.net +everesttech.net +appcenter.ms +snapchat.com +jpnn.com +att.com +ox.ac.uk +bluekai.com +umn.edu +entrepreneur.com +xnxx.com +patch.com +oreilly.com +inc.com +politico.com +theepochtimes.com +fortune.com +elpais.com +suara.com +binance.com +linktr.ee +plos.org +gamepedia.com +google.com.ar +tudou.com +google.com.au +hotjar.com +uci.edu +cdstm.cn +ed.gov +canada.ca +mathtag.com +google.co.id +wisc.edu +aliyuncs.com +jhu.edu +deloitte.com +afternic.com +theconversation.com +wiktionary.org +quantserve.com +capitalone.com +upwork.com +epicgames.com +newsweek.com +python.org +spiegel.de +deepl.com +sfgate.com +scientificamerican.com +newrelic.com +chicagotribune.com 
+bankofamerica.com +google.com.sa +agkn.com +ask.com +ucla.edu +bbb.org +mercadolivre.com.br +elegantthemes.com +mercadolibre.com.mx +namnak.com +wikia.com +douban.com +sfx.ms +slate.com +ny.gov +g.page +quizlet.com +yimg.com +chinadaily.com.cn +bmj.com +adp.com +glassdoor.com +fb.me +feedly.com +manoramaonline.com +360.com +ndtv.com +www.gov.cn +ubuntu.com +xfinity.com +appspot.com +weforum.org +live.net +arxiv.org +medicalnewstoday.com +timeanddate.com +bukalapak.com +zol.com.cn +qualtrics.com +ameblo.jp +cmu.edu +utexas.edu +chouftv.ma +icicibank.com +realtor.com +indiegogo.com +criteo.net +tencent.com +zerodha.com +cam.ac.uk +crwdcntrl.net +lenovo.com +pcmag.com +chron.com +oecd.org +biomedcentral.com +cbslocal.com +playstation.com +discordapp.com +thesun.co.uk +uk.com +aboutcookies.org +khanacademy.org +mirror.co.uk +nr-data.net +softonic.com +telegram.me +y2mate.com +arstechnica.com +bootstrapcdn.com +bet365.com +google.com.eg +apa.org +nps.gov +uchicago.edu +qz.com +ign.com +e2ro.com +bizjournals.com +orange.fr +uiuc.edu +hilton.com +cnnic.cn +unity3d.com +adform.net +dns.google +howstuffworks.com +tapad.com +zaloapp.com +dropcatch.com +merdeka.com +ftc.gov +iqbroker.com +mercadolibre.com.ar +fastly.net +asus.com +fontawesome.com +verisign.com +verizon.com +istockphoto.com +us.com +crunchyroll.com +messenger.com +bitnami.com +nba.com +purdue.edu +nicovideo.jp +adsafeprotected.com +fao.org +comodoca.com +parallels.com +si.edu +news.com.au +house.gov +xing.com +jotform.com +over-blog.com +techradar.com +mckinsey.com +visualstudio.com +krxd.net +openstreetmap.org +etoro.com +ea.com +nydailynews.com +smh.com.au +namu.wiki +reverso.net +nikkei.com +moneycontrol.com +spotxchange.com +autodesk.com +netscape.com +asos.com +mgid.com +kakao.com +barnesandnoble.com +usc.edu +earthlink.net +nyu.edu +thehill.com +shaparak.ir +gosuslugi.ru +instructables.com +umeng.com +livescience.com +3lift.com +pnas.org +toutiao.com +abs-cbn.com +pewresearch.org +jstor.org +altervista.org +trafficmanager.net +googleblog.com +blogspot.co.uk +op.gg +idntimes.com +turn.com +inquirer.net +hotstar.com +thesaurus.com +ning.com +senate.gov +lifehacker.com +eastday.com +norton.com +enable-javascript.com +joomla.org +aljazeera.com +azure.com +thepiratebay.org +variety.com +rambler.ru +allegro.pl +thedailybeast.com +geeksforgeeks.org +sectigo.com +jrj.com.cn +uber.com +census.gov +dictionary.com +discord.gg +azurewebsites.net +sberbank.ru +proiezionidiborsa.it +jsdelivr.net +xiaomi.com +viva.co.id +stripe.com +secureserver.net +sakura.ne.jp +wildberries.ru +lijit.com +duke.edu +vmware.com +thelancet.com +exelator.com +ow.ly +rollingstone.com +mookie1.com +prezi.com +venturebeat.com +ladbible.com +trendyol.com +google.co.kr +icio.us +usgs.gov +acs.org +oppomobile.com +amap.com +gmw.cn +fbsbx.com +ufl.edu +fool.com +sky.com +branch.io +gstatic.com +postgresql.org +atlassian.com +alwafd.news +costco.com +gartner.com +thefreedictionary.com +sonhoo.com +express.co.uk +northwestern.edu +zend.com +shopee.co.id +getbootstrap.com +msu.edu +foursquare.com +genius.com +google.com.ua +withgoogle.com +proofpoint.com +imageshack.us +umd.edu +angelfire.com +web.de +brilio.net +ucsd.edu +urbandictionary.com +9gag.com +nhk.or.jp +about.me +jianshu.com +ameritrade.com +talktalk.co.uk +cctv.com +seekingalpha.com +letsencrypt.org +lemonde.fr +td.com +gitlab.com +themeisle.com +citi.com +patria.org.ve +jamanetwork.com +sahibinden.com +mzstatic.com +wish.com +newegg.com +scmp.com +hatena.ne.jp +namecheap.com +rottentomatoes.com +ria.ru 
+bls.gov +lowes.com +utoronto.ca +spankbang.com +docker.com +sgsnssdk.com +digitaltrends.com +tiktokcdn-in.com +hatenablog.com +efu.com.cn +naukri.com +people.com +dhl.com +thetimes.co.uk +hhs.gov +alexa.com +namasha.com +teads.tv +unc.edu +duolingo.com +slashdot.org +gismeteo.ru +googleapis.com +theglobeandmail.com +hm.com +adjust.com +gmx.net +heytapmobi.com +itu.int +smartadserver.com +doubleverify.com +ssl-images-amazon.com +privacyshield.gov +dotomi.com +nejm.org +mama.cn +apachefriends.org +mixcloud.com +nfl.com +cricbuzz.com +squareup.com +kaspersky.com +mixpanel.com +360doc.com +shopee.vn +schwab.com +sec.gov +shopee.tw +boston.com +rctiplus.com +bhphotovideo.com +today.com +history.com +google.gr +moodle.org +ouedkniss.com +mitre.org +media.net +1rx.io +ebay.com.au +google.nl +hollywoodreporter.com +w.org +gamespot.com +zalo.me +mi.com +newscientist.com +myqcloud.com +donya-e-eqtesad.com +wufoo.com +dedecms.com +openssl.org +creativecdn.com +adobe.io +m.me +kapanlagi.com +woocommerce.com +xe.com +freebsd.org +siemens.com +xbox.com +zara.com +wunderground.com +focus.cn +qpic.cn +eff.org +marca.com +www.nhs.uk +omtrdc.net +edgesuite.net +illinois.edu +in.gr +java.com +nymag.com +calendly.com +bitbucket.org +indiamart.com +drupal.org +albawabhnews.com +iyiou.com +videocampaign.co +arizona.edu +indianexpress.com +google.ro +techtarget.com +onlyfans.com +dcard.tw +kernel.org +ebay-kleinanzeigen.de +unicef.org +dbs.com.sg +thegatewaypundit.com +moz.com +ucdavis.edu +asu.edu +macys.com +wattpad.com +metro.co.uk +colorado.edu +agafurretor.com +livestream.com +51yes.com +iana.org +amazon.com.mx +wp.me +automattic.com +espncricinfo.com +biblegateway.com +adsymptotic.com +repubblica.it +wa.gov +eastmoney.com +libsyn.com +dmoz.org +icann.org +gsmarena.com +linksynergy.com +borna.news +lefigaro.fr +coinbase.com +yts.mx +technologyreview.com +foodnetwork.com +rutgers.edu +go-mpulse.net +fast.com +licdn.com +mapquest.com +gotomeeting.com +ensonhaber.com +ustream.tv +oschina.net +spb.ru +thenextweb.com +redfin.com +sony.com +ozon.ru +uspto.gov +zippyshare.com +incometaxindiaefiling.gov.in +gallup.com +qoo10.sg +businessinsider.de +rediff.com +alodokter.com +shopee.com.my +banvenez.com +ibicn.com +w55c.net +t-online.de +ih5.cn +onenote.net +archives.gov +mega.nz +cbssports.com +sba.gov +livedoor.jp +sputniknews.com +discogs.com +pinterest.ca +justice.gov +ubc.ca +tutorialspoint.com +audible.com +phys.org +kakaku.com +wordreference.com +opensource.org +gusuwang.com +howtogeek.com +ssrn.com +dot.gov +vnexpress.net +eset.com +sap.com +matterport.com +rezka.ag +liansuo.com +hbomax.com +mystrikingly.com +elbalad.news +teachable.com +shutterfly.com +custhelp.com +runoob.com +frontier.com +miui.com +simpli.fi +1337x.to +xboxlive.com +ninisite.com +bluehost.com +rfihub.com +mdpi.com +filimo.com +contextweb.com +rs6.net +myworkdayjobs.com +dbankcloud.com +zadn.vn +mlb.com +virginia.edu +samsungapps.com +prweb.com +wowhead.com +last.fm +hindustantimes.com +businessweek.com +usa.gov +sina.cn +chinanews.com +adweek.com +taleo.net +edx.org +globalsign.com +salon.com +tremorhub.com +fbi.gov +beytoote.com +envato.com +hotmart.com +pwc.com +yumpu.com +uptodown.com +sitescout.com +osu.edu +garmin.com +divar.ir +podbean.com +google.com.my +euronews.com +nsw.gov.au +sq.cn +as.com +samsungcloud.com +redbubble.com +list-manage1.com +admin.ch +goo.ne.jp +bidr.io +acm.org +discovery.com +blackboard.com +wp.pl +zemanta.com +adobedtm.com +va.gov +rakuten.com +imrworldwide.com +ivi.ru +unrulymedia.com +bu.edu +meituan.com 
+colorlib.com +lonelyplanet.com +slickdeals.net +viber.com +paypal.me +kde.org +rbc.ru +themegrill.com +secomtrust.net +hurriyet.com.tr +ap.org +frontiersin.org +is.gd +google.co.ve +readthedocs.io +ethz.ch +qlogo.cn +westernjournal.com +ecosia.org +thoughtco.com +boc.cn +turkiye.gov.tr +biobiochile.cl +fpt.vn +nokia.com +accenture.com +fcc.gov +pastebin.com +pandora.com +globenewswire.com +libero.it +bostonglobe.com +wustl.edu +jiathis.com +smithsonianmag.com +t-mobile.com +onesignal.com +lego.com +tamu.edu +flic.kr +shopeemobile.com +infobae.com +dhs.gov +consumerreports.org +onet.pl +kontan.co.id +dropboxusercontent.com +perl.org +phpbb.com +bigcartel.com +mcafee.com +shein.com +anchor.fm +notion.so +lencr.org +amd.com +faqs.org +weather.gov +google.be +pcworld.com +elwatannews.com +nordstrom.com +atdmt.com +bild.de +cnnindonesia.com +tmz.com +avg.com +qcloud.com +usembassy.gov +zing.vn +iheart.com +mercari.com +ey.com +vanityfair.com +indexww.com +atlassian.net +hostgator.com +msnbc.com +fedoraproject.org +dmm.co.jp +yolasite.com +3gppnetwork.org +liadm.com +treasury.gov +netdna-ssl.com +dianping.com +hexun.com +smugmug.com +voanews.com +strava.com +imf.org +udn.com +emxdgt.com +theregister.co.uk +seznam.cz +narod.ru +com.com +rfc-editor.org +thawte.com +foxbusiness.com +google.ch +discover.com +lg.com +dafont.com +it168.com +ucl.ac.uk +4shared.com +irctc.co.in +500px.com +garena.com +oxfordjournals.org +justdial.com +att.net +medscape.com +myworkday.com +elsevierhealth.com +premierleague.com +globalsign.net +indiana.edu +58.com +firefox.com +coupang.com +cnbcindonesia.com +google.com.ph +lenta.ru +gutenberg.org +computerworld.com +billboard.com +snopes.com +greenpeace.org +army.mil +elmundo.es +asahi.com +mehrnews.com +expedia.com +amazon.cn +tufts.edu +ibb.co +amazon.com.au +optimizely.com +sharethrough.com +thehindu.com +utah.edu +brookings.edu +youporn.com +schneier.com +delicious.com +cancer.org +google.co.za +dol.gov +media-amazon.com +usertrust.com +icloud-content.com +ncsu.edu +dreamstime.com +thestar.com +myanimelist.net +banggood.com +doordash.com +google.at +liveinternet.ru +broadcom.com +congress.gov +bund.de +houzz.com +seattletimes.com +novell.com +hyatt.com +pixiv.net +cpan.org +udel.edu +heart.org +japanpost.jp +eatthis.com +shareasale.com +ultimate-guitar.com +mercurynews.com +neilpatel.com +cnbeta.com +lww.com +gettyimages.com +mozilla.net +filesusr.com +prestashop.com +allrecipes.com +rutracker.org +msecnd.net +xiaomi.net +mathrubhumi.com +crunchbase.com +heycould.com +nasdaq.com +pitt.edu +texas.gov +schoology.com +bugsnag.com +herokuapp.com +tsetmc.com +tejaratnews.com +cia.gov +nielsen.com +app.link +energy.gov +gumgum.com +ico.org.uk +navy.mil +freshdesk.com +trustarc.com +staticflickr.com +truste.com +nintendo.com +leboncoin.fr +iherb.com +eba.gov.tr +youdao.com +undp.org +globalnews.ca +welt.de +standard.co.uk +tinypic.com +nyc.gov +google.com.pk +semanticscholar.org +digitalocean.com +eurekalert.org +pchome.com.tw +renren.com +brightcove.net +disney.com +impress.co.jp +me.com +ads-twitter.com +ctvnews.ca +naver.jp +ihg.com +goodhousekeeping.com +yahoodns.net +g.co +collegeboard.org +wikidot.com +ssa.gov +thebalance.com +haxx.se +chegg.com +aarp.org +ucweb.com +cygwin.com +google.cl +cuny.edu +chinaz.com +yandex.com +adblockplus.org +washingtontimes.com +reference.com +typekit.net +mileroticos.com +clarin.com +ntp.org.cn +remove.bg +fifa.com +ytmp3.cc +panasonic.com +domaintools.com +blogspot.ca +theregister.com +vidio.com +irishtimes.com +ucsb.edu +ecer.com 
+ed.ac.uk +uidai.gov.in +stickyadstv.com +depositphotos.com +battle.net +nhl.com +pippio.com +answers.com +citrix.com +cnki.net +pearson.com +huaban.com +tiny.cc +cdn-apple.com +ibtimes.com +symcd.com +bleacherreport.com +imageshack.com +hubpages.com +wondershare.com +freelancer.com +philips.com +accor.com +google.az +buzzfeednews.com +deezer.com +w3school.com.cn +homestead.com +aweber.com +jimdofree.com +mopub.com +house365.com +thingiverse.com +biglobe.ne.jp +france24.com +vrbo.com +ehow.com +farfetch.com +weibo.cn +athemes.com +medlineplus.gov +tesla.com +payu.in +flipboard.com +prothomalo.com +blog.google +gatech.edu +ancestry.com +openldap.org +semrush.com +coindesk.com +ew.com +104.com.tw +junwonsil.com +pcbaby.com.cn +utorrent.com +nbcsports.com +usbank.com +service.gov.uk +nownews.com +videolan.org +youth.cn +minecraft.net +ebc.net.tw +chsi.com.cn +heise.de +insider.com +nobelprize.org +deadline.com +hotmail.com +000webhostapp.com +securityfocus.com +codecademy.com +msdn.com +flaticon.com +cbs.com +japantimes.co.jp +blogspot.de +coe.int +lifewire.com +amazontrust.com +cancer.gov +shopee.co.th +hawaii.edu +chinanetrank.com +binomo.com +digital.com +vt.edu +oregonstate.edu +cell.com +softpedia.com +royalbank.com +nerdwallet.com +livedoor.com +georgetown.edu +realsrv.com +boredpanda.com +flurry.com +mtv.com +youronlinechoices.eu +redcross.org +interia.pl +thomsonreuters.com +olympic.org +cosmopolitan.com +gimp.org +redbull.com +1drv.ms +shopee.ph +ccc.de +bttrack.com +docusign.net +kuronekoyamato.co.jp +spreaker.com +denverpost.com +lastpass.com +coursehero.com +timeout.com +superuser.com +yoox.com +dspultra.com +coloros.com +ovh.net +startribune.com +india.com +uscis.gov +mac.com +pdfdrive.com +zeit.de +groupon.com +babyschool.com.cn +caltech.edu +comcast.net +scoop.it +skroutz.gr +ilo.org +calameo.com +webnode.com +zdn.vn +zhibo8.cc +amplitude.com +krupdapp.com +gumroad.com +nextdoor.com +cntv.cn +google.com.pe +google.dz +brown.edu +git-scm.com +nj.com +msi.com +nbc.com +cameraddns.net +buydomains.com +gvt3.com +jamnews.com +axisbank.co.in +vogue.com +transferwise.com +logitech.com +tv9marathi.com +blizzard.com +iop.org +mfadsrvr.com +informer.com +about.google +xkcd.com +onlinesbi.sbi +dpreview.com +fema.gov +stuff.co.nz +eventbrite.co.uk +miamiherald.com +ouo.io +pixlr.com +corriere.it +vatican.va +eqxiu.com +examiner.com +cointelegraph.com +hotels.com +ribunews.com +britishcouncil.org +line-apps.com +elle.com +socdm.com +braze.com +mcgill.ca +delta.com +artstation.com +colostate.edu +bankrate.com +reverbnation.com +trulia.com +overstock.com +outbrainimg.com +wipo.int +home.blog +mydrivers.com +space.com +sophos.com +kundelik.kz +esquire.com +goal.com +findlaw.com +serving-sys.com +state.tx.us +uga.edu +pngtree.com +tinder.com +csmonitor.com +walgreens.com +uniqlo.com +searchengineland.com +perl.com +ford.com +tutsplus.com +hrw.org +riotgames.com +openoffice.org +service-now.com +tomshardware.com +bigo.sg +trendmicro.com +livechatinc.com +codecanyon.net +adobelogin.com +wixstatic.com +applovin.com +360yield.com +southcn.com +eonline.com +gpo.gov +pbase.com +joinhoney.com +warnerbros.com +jetbrains.com +morningpost.com.cn +uc.cn +technorati.com +techrepublic.com +blackberry.com +alibabausercontent.com +raspberrypi.org +zazzle.com +isc.org +rumble.com +olx.pl +ebay.fr +uiowa.edu +wired.co.uk +sarkariresult.com +milliyet.com.tr +stitcher.com +fivethirtyeight.com +xero.com +xiami.com +filmix.co +olx.ua +cvent.com +mos.ru +smartsheet.com +dailycaller.com +wikibooks.org +ibytedtos.com 
+osha.gov +vanguard.com +newsmax.com +unl.edu +000webhost.com +sfu.ca +popularmechanics.com +kp.ru +jpost.com +123rf.com +smashingmagazine.com +nzherald.co.nz +4chan.org +258.com +coingecko.com +metmuseum.org +hpe.com +bustle.com +abc.es +alnaharegypt.com +robinhood.com +people.com.cn +treehugger.com +agoda.com +gap.com +gtmetrix.com +uchi.ru +google.se +popsugar.com +telegra.ph +dawn.com +azcentral.com +eyeota.net +dallasnews.com +douyu.com +appier.net +bell-labs.com +jw.org +wikisource.org +flashtalking.com +monster.com +tdameritrade.com +quantcount.com +esa.int +google.pt +aa.com +wplay.co +uscourts.gov +pushimg.com +tableau.com +lavanguardia.com +gmu.edu +dmm.com +sourceware.org +loom.com +faz.net +ycombinator.com +creditchina.gov.cn +boingboing.net +knowyourmeme.com +dangdang.com +google.com.co +ubisoft.com +mynavi.jp +foreignpolicy.com +amazon.com.br +trontv.com +chip.de +rapid7.com +docin.com +elfagr.com +refinery29.com +popsci.com +arduino.cc +zomato.com +gwu.edu +gmanetwork.com +zdf.de +2345.com +channel4.com +sapo.pt +laodong.vn +monday.com +codepen.io +storify.com +straitstimes.com +fitbit.com +sharethis.com +polygon.com +eee114.com +sonobi.com +vvvdj.com +eghtesadonline.com +clickfunnels.com +us.org +boutell.co.uk +wistia.com +buffer.com +lua.org +cloudinary.com +curl.se +hbo.com +bola.com +chinnica.net +iastate.edu +ucoz.ru +emofid.com +umass.edu +blogspot.com.es +boutell.com +ahrefs.com +secureworks.com +centos.org +indiatoday.in +gyazo.com +bancodevenezuela.com +mydramalist.com +ohchr.org +axios.com +huffingtonpost.co.uk +freewebs.com +animeflv.net +yp.to +oregonlive.com +huim.com +yahoo.net +avcdn.net +zappos.com +aajtak.in +vanderbilt.edu +diigo.com +kooora.com +creditkarma.com +af.mil +archdaily.com +asana.com +blogspot.fr +mpg.de +kohls.com +axs.com +mentalfloss.com +atlasobscura.com +xabbs.com +tribalfusion.com +nodejs.org +squid-cache.org +thebalancecareers.com +iweihai.cn +campaign-archive.com +tabelog.com +kijiji.ca +a8.net +gtimg.cn +parler.com +sueddeutsche.de +malwarebytes.com +starbucks.com +alimama.com +experian.com +bitchute.com +searchenginejournal.com +motor1.com +chaduo.com +screenrant.com +tidaltv.com +focus.de +amnesty.org +strikingly.com +smartcloudcon.com +fsu.edu +wampserver.com +etrade.com +livemint.com +aka.ms +cookpad.com +gogoanime.so +snssdk.com +hc360.com +windowsphone.com +pinterest.co.uk +onetrust.com +gamer.com.tw +id5-sync.com +si.com +tagesschau.de +cloudflare.net +17track.net +consultant.ru +reclameaqui.com.br +baltimoresun.com +cookielaw.org +ksmobile.com +unblog.fr +cairo24.com +livestrong.com +kommersant.ru +hespress.com +clinicaltrials.gov +footprintdns.com +mozgcp.net +powerbi.com +theage.com.au +segment.io +nifty.com +lifo.gr +upi.com +9384.com +aljazeera.net +torob.com +wsu.edu +gfycat.com +iliangcang.com +mhlw.go.jp +home77.com +usf.edu +fbpigeon.com +iso.ch +apkpure.com +sbnation.com +tynt.com +adgrx.com +news18.com +ucsf.edu +caixa.gov.br +elcomercio.com +weblio.jp +epfindia.gov.in +byu.edu +eater.com +smaato.net +clickbank.net +hardened-php.net +consensu.org +bloglovin.com +brightcove.com +chosun.com +nairaland.com +scholastic.com +ebay.it +louisvuitton.com +redtube.com +uu.nl +huffingtonpost.ca +sjtu.edu.cn +mimecast.com +ovh.com +lycos.com +gamestop.com +viglink.com +phicdn.net +lse.ac.uk +fujitsu.com +rice.edu +nyaa.si +nationalpost.com +sxyprn.com +iu.edu +leagueoflegends.com +tass.ru +33across.com +observer.com +anydesk.com +isnssdk.com +made-in-china.com +ajc.com +adage.com +zerohedge.com +mass.gov +bt.com +adroll.com 
+srvtrck.com +codeplex.com +wbx2.com +rochester.edu +purl.org +rahavard365.com +uwaterloo.ca +online-convert.com +xda-developers.com +hindawi.com +vivoglobal.com +dartmouth.edu +addthisedge.com +google.ae +aofex.com +qiita.com +rsc.org +pku.edu.cn +unam.mx +vine.co +lulu.com +xmnn.cn +cvs.com +campaign-archive1.com +justia.com +cima4u.io +cwi.nl +nap.edu +unsw.edu.au +namebright.com +example.org +blogs.com +stackadapt.com +worldcat.org +motherjones.com +miro.com +11st.co.kr +jiameng.com +91jm.com +michigan.gov +infusionsoft.com +timesonline.co.uk +cri.cn +legacy.com +mheducation.com +eghtesadnews.com +doxygen.nl +globalsources.com +republika.co.id +eu.com +yieldmo.com +buffalo.edu +hamariweb.com +ama-assn.org +ccleaner.com +mmload.com +siteground.com +thepaper.cn +gq.com +convertio.co +typekit.com +caniuse.com +freecodecamp.org +lifehack.org +hbs.edu +thestreet.com +freep.com +hgtv.com +virginia.gov +man7.org +tunein.com +rakuten-sec.co.jp +inews.id +crazyegg.com +siemens.de +clickpost.jp +campaign-archive2.com +timesofisrael.com +orcid.org +auth0.com +wikispaces.com +itmedia.co.jp +doxygen.org +drudgereport.com +sfr.fr +libpng.org +zlib.net +emalls.ir +mk.ru +chartbeat.net +icims.com +motorplus-online.com +liveleak.com +kotaku.com +npmjs.com +owneriq.net +zeotap.com +yourdictionary.com +united.com +anu.edu.au +dyndns.org +ticketmaster.com +stocktwits.com +livescore.com +sporx.com +macrumors.com +emarketer.com +nicsorts-accarade.com +afip.gob.ar +weathercn.com +list-manage2.com +suntimes.com +itv.com +dbankcloud.eu +bufferapp.com +championat.com +sketchfab.com +ebscohost.com +substack.com +google.cz +recaptcha.net +vulture.com +thinkwithgoogle.com +ualberta.ca +gamersky.com +spec.org +samsungosp.com +insertlive.com +jb51.net +urdupoint.com +makeuseof.com +gzip.org +business-standard.com +myntra.com +uoregon.edu +bedbathandbeyond.com +dan.com +parsfootball.com +note.com +rackcdn.com +ameba.jp +heytapdl.com +onlamp.com +lanacion.com.ar +chartbeat.com +sportbible.com +pinduoduo.com +thespruce.com +cootek.com +motorsport.com +rarbg.to +jooble.org +anjuke.com +edublogs.org +google.hu +doodle.com +cern.ch +cafe24.com +campograndenews.com.br +box.net +nd.edu +xianjichina.com +telephony.goog +timeweb.ru +tempo.co +ionos.com +cofile.net +digialm.com +kahoot.it +segment.com +sketchup.com +iubenda.com +medicinenet.com +memcached.org +angel.co +lnkd.in +visa.com +moma.org +justgiving.com +ctrip.com +clevelandclinic.org +basecamp.com +cyol.com +ml314.com +pnc.com +studentaid.gov +pcre.org +flipsnack.com +gazeta.ru +streamable.com +faa.gov +mediawiki.org +biography.com +wikiquote.org +tomsguide.com +rtve.es +eclipse.org +jwplayer.com +nocookie.net +dcinside.com +unity.com +scdn.co +oeeee.com +dropboxapi.com +skysports.com +webdav.org +kitco.com +defense.gov +tawk.to +lazada.co.th +fastcgi.com +retailmenot.com +xrea.com +apple.co +unpkg.com +drom.ru +google.co.il +federalreserve.gov +in.gov +fliphtml5.com +oculus.com +nalog.ru +freetype.org +rkdms.com +dmca.com +apachelounge.com +flightradar24.com +blockchain.com +hh.ru +xhamsterlive.com +cleartax.in +adidas.com +tahiamasr.com +ec-lyon.fr +hubspot.net +bbcollab.com +aclu.org +realestate.com.au +netdna-cdn.com +agah.com +msauth.net +donga.com +heytapmobile.com +formstack.com +pa.gov +infogram.com +pcgamer.com +proquest.com +metacafe.com +study.com +t.cn +rockstargames.com +blogspot.in +root-servers.net +nexusmods.com +firefoxchina.cn +cdiscount.com +tripadvisor.co.uk +createjs.com +fazenda.gov.br +www.gov.br +itch.io +kompas.tv +axisbank.com 
+enlightenment.org +xmlsoft.org +cc.com +mastercard.com +otvfoco.com.br +yadi.sk +francetvinfo.fr +gamesradar.com +evidon.com +catchthemes.com +mmstat.com +souq.com +protonmail.com +washingtonexaminer.com +sleepycat.com +roku.com +post-gazette.com +cutt.ly +j.mp +rsasecurity.com +goethe.de +3m.com +gab.com +counterpane.com +get-express-vpn.online +rei.com +travelandleisure.com +modsecurity.org +channelnewsasia.com +sozcu.com.tr +staples.com +cio.com +propublica.org +vseigru.net +posterous.com +cars.com +nus.edu.sg +kino-teatr.ru +haaretz.com +foxsports.com +yeniakit.com.tr +generatepress.com +alexametrics.com +bolasport.com +southwest.com +ziprecruiter.com +chinatax.gov.cn +barrons.com +inquirer.com +chromium.org +oath.com +filezilla-project.org +soft98.ir +tsinghua.edu.cn +altavista.com +curbed.com +unimelb.edu.au +wireshark.org +pewinternet.org +rfi.fr +tradedoubler.com +nflxext.com +akstat.io +yandex.net +uky.edu +makemytrip.com +dezeen.com +visual.ly +emory.edu +onmarshtompor.com +znds.com +radissonhotels.com +hwg.org +theaustralian.com.au +hopkinsmedicine.org +quizizz.com +wassenaar.org +pscp.tv +iz.ru +googlepages.com +google.ie +brainyquote.com +twilio.com +serverwatch.com +theintercept.com +memurlar.net +mlive.com +gulfnews.com +yodobashi.com +neobux.com +exoclick.com +sephora.com +kriesi.at +leparisien.fr +ijg.org +syf.com +ems.com.cn +arte.tv +zcool.com.cn +rbcroyalbank.com +ipcc.ch +wistia.net +52pk.com +amazon.jobs +mufg.jp +nsf.gov +discovermagazine.com +ora.com +mofidonline.com +cleveland.com +myftpupload.com +imperial.ac.uk +rakuten-bank.co.jp +lnk.to +protothema.gr +t66y.com +twitpic.com +demon.co.uk +dreamhost.com +apache-ssl.org +matomo.org +ynet.com +acer.com +vungle.com +square.site +churchofjesuschrist.org +manchester.ac.uk +ipredictive.com +caf.fr +usaa.com +icq.com +techsmith.com +sfchronicle.com +discuz.net +pdflib.com +onaudience.com +3dmgame.com +nationalreview.com +zougla.gr +tiqcdn.com +plurk.com +1tv.ru +asriran.com +aappublications.org +newstrend.news +apple.news +scotiabank.com +nber.org +complex.com +uq.edu.au +51job.com +pinterest.de +ukr.net +ohio.gov +jpush.cn +eia.gov +metacritic.com +mozaws.net +freetds.org +uic.edu +storm.mg +teespring.com +sonyliv.com +news.cn +sciencealert.com +cdbaby.com +eluniverso.com +uzone.id +9to5mac.com +line-scdn.net +home.kpmg +hwcdn.net +philly.com +supersonicads.com +akamai.com +kizlarsoruyor.com +technoratimedia.com +weather.com.cn +uh.edu +pressreader.com +rg.ru +ubs.com +missouri.edu +lentainform.com +netgear.com +video-ad-skipper.com +tianqi.com +menshealth.com +specbench.org +pole-emploi.fr +commbank.com.au +theonion.com +kaskus.co.id +flashscore.com +aastocks.com +marthastewart.com +mathworks.com +ynet.co.il +shimo.im +skyrock.com +webthing.com +terra.com.br +theme-fusion.com +penguinrandomhouse.com +kayak.com +msftauth.net +helpshift.com +odoo.com +self.com +orf.at +thenation.com +ucsc.edu +yna.co.kr +acuityplatform.com +url.cn +careerbuilder.com +federalregister.gov +soup.io +mynet.com +perfectdomain.com +fas.org +n-tv.de +lothar.com +virtualbox.org +rd.com +epwk.com +mofcom.gov.cn +etherscan.io +virustotal.com +tampabay.com +sacbee.com +youradchoices.com +lazada.com.ph +cronolog.org +worldofwarcraft.com +ge.com +everydayhealth.com +inmobi.com +yektanet.com +myfitnesspal.com +newsday.com +ct.gov +districtm.io +warcraftlogs.com +verywellmind.com +rit.edu +iplanet.com +hud.gov +nseindia.com +sbs.com.au +snapkit.com +newsearning.com +towardsdatascience.com +idnes.cz +lync.com +cj.com +dbankcloud.cn +webflow.io 
+cnrs.fr +cargocollective.com +businessinsider.com.au +cafepress.com +seek.com.au +olx.com.br +bit.do +weheartit.com +golux.com +ce.cn +routledge.com +uservoice.com +mewe.com +adotmob.com +harpersbazaar.com +adentifi.com +grammarly.io +unilad.co.uk +windowsazure.com +gucci.com +c-span.org +onelogin.com +5ch.net +meb.gov.tr +microsoftstream.com +zhaket.com +psychcentral.com +ixigua.com +uefa.com +successfactors.com +abebooks.com +nur.kz +ebay.ca +pstatp.com +getresponse.com +ouest-france.fr +bdstatic.com +page.link +gingerall.com +bdimg.com +thestar.com.my +prnt.sc +health.com +phpmyadmin.net +kemdikbud.go.id +cloudsink.net +manyvids.com +yoast.com +radio.com +newatlas.com +topfo.com +eshkol.io +mcdonalds.com +apachetutor.org +skimresources.com +google.dk +blogspot.com.au +symcb.com +39.net +payoneer.com +legislation.gov.uk +banesconline.com +lun.com +sabq.org +unhcr.org +byjus.com +zhaopin.com +google.fi +tp-link.com +mercadolibre.com.ve +gh0089.com +studiopress.com +wholefoodsmarket.com +fril.jp +futbin.com +canon.com +opaque.net +socialblade.com +cms.gov +dagospia.com +utk.edu +utah.gov +pulzo.com +hometax.go.kr +sinaimg.cn +bankmellat.ir +kidshealth.org +special-offers.online +xs4all.nl +spectrum.net +bitdefender.com +thethao247.vn +figma.com +vivo.com.cn +onlinedown.net +kaiserpermanente.org +blog.com +blender.org +moatpixel.com +pitchfork.com +dalfak.com +ryanair.com +pardot.com +aspnetcdn.com +dnevnik.ru +iobit.com +ahajournals.org +onedrive.com +apachehaus.com +habr.com +olx.in +gmarket.co.kr +ibyteimg.com +7-zip.org +documentcloud.org +oraclecloud.com +mykajabi.com +vg.no +aizhan.com +poste.it +uconn.edu +yellowpages.com +yorku.ca +netperf.org +mercadolibre.com.co +dbankcdn.com +dyntrk.com +temple.edu +avaz.ba +netease.com +sedo.com +seasonvar.ru +viki.com +simplesite.com +zopim.com +opensooq.com +verizonmedia.com +hangseng.com +lynda.com +gsu.edu +haibunda.com +warwick.ac.uk +billdesk.com +endclothing.com +ptt.cc +springerlink.com +affordable-papers.net +ksl.com +gawker.com +forrester.com +postimg.org +firebaseio.com +kbb.com +syr.edu +adf.ly +nj.gov +ibtimes.co.uk +code.org +ucr.edu +rakuten.ne.jp +honeywell.com +abb.com +bookdepository.com +moe.gov.cn +omnitagjs.com +gla.ac.uk +louvre.fr +infowars.com +thrillist.com +advangelists.com +meraki.com +abril.com.br +skillshare.com +python.ca +thrtle.com +independent.ie +squidoo.com +edmunds.com +anrdoezrs.net +wegotthiscovered.com +sendgrid.net +babycenter.com +usp.br +pendo.io +u-tokyo.ac.jp +bluestacks.com +real.com +cloudwaysapps.com +sehatq.com +pearltrees.com +autohome.com.cn +ebrun.com +xjtu.edu.cn +sendspace.com +bravesites.com +geocities.jp +eklablog.com +grubhub.com +daraz.pk +kknews.cc +jst.go.jp +ngacn.cc +imagemagick.com +freejobalert.com +hepsiburada.com +excite.co.jp +videojs.com +dummies.com +kaspersky-labs.com +apachetoday.com +eksisozluk.com +autotrader.com +verywellhealth.com +icbc.com.cn +bola.net +apple-cloudkit.com +shopee.sg +eluniversal.com.mx +freee.co.jp +nelreports.net +royalsocietypublishing.org +drugs.com +americanas.com.br +infoworld.com +net-a-porter.com +thebrighttag.com +ecwid.com +ansa.it +ksmobile.net +wto.org +univie.ac.at +cylance.com +mcusercontent.com +slatic.net +poshmark.com +ubereats.com +ti.com +morningstar.com +cheezburger.com +mapbox.com +sandiegouniontribune.com +emojipedia.org +blog.jp +oregon.gov +cengage.com +ibiblio.org +semasio.net +verizonwireless.com +networksolutions.com +wi.gov +copyright.gov +helsinki.fi +recode.net +usmagazine.com +useinsider.com +rte.ie +stripchat.com 
+rapidshare.com +uw.edu +ib-ibi.com +tenable.com +dostor.org +rsafrwd.com +51auto.com +thebalancesmb.com +newrepublic.com +game-mode.net +gopro.com +forgeofempires.com +wolframalpha.com +edh.tw +kdslife.com +kinja.com +kochava.com +eu.org +rakuten-card.co.jp +msk.ru +miami.edu +crictracker.com +wikileaks.org +legit.ng +yenisafak.com +page.tl +govdelivery.com +business2community.com +ezinearticles.com +ovhcloud.com +digitaljournal.com +google.bg +one.com +impact-ad.jp +bisnis.com +creditonebank.com +gds.it +angieslist.com +bestlifeonline.com +aniview.com +indiewire.com +awin1.com +bodybuilding.com +appleinsider.com +ifeng.com +pingdom.com +core.ac.uk +ifttt.com +winzip.com +illinois.gov +mo.gov +kompasiana.com +subscene.com +victoriassecret.com +hops.id +threebit.net +24h.com.vn +kugou.com +moneyforward.com +leeds.ac.uk +jsonline.com +githubusercontent.com +wikiwand.com +blogspot.it +blogfa.com +247sports.com +informationweek.com +thekitchn.com +mediatek.com +sun-sentinel.com +wartaekonomi.co.id +bartleby.com +match.com +here.com +jkforum.net +mawdoo3.com +wbur.org +bigcommerce.com +bitcoin.org +gigaom.com +scotsman.com +gog.com +joins.com +nngroup.com +origin.com +parivahan.gov.in +unixtools.org +nme.com +aaa.com +humblebundle.com +sbisec.co.jp +mosalasonline.com +fraunhofer.de +chooseauto.com.cn +lordfilms-s.pw +mobile.de +geotrust.com +politico.eu +sitepoint.com +ucalgary.ca +csod.com +gnome.org +autoblog.com +manganelo.com +phoca.cz +juntadeandalucia.es +mainichi.jp +dior.com +news24.com +tgju.org +truoptik.com +nflxvideo.net +1post4all.com +rappler.com +torproject.org +mybigcommerce.com +sabah.com.tr +smarturl.it +theknot.com +mobafire.com +aa.com.tr +ocregister.com +abc.com +whiterabbitpress.com +google.sk +unm.edu +ubi.com +zapier.com +opensuse.org +pp.ua +starwars.com +alibabadns.com +a-msedge.net +packagist.org +onelink.me +google.co.nz +blismedia.com +narcity.com +financialexpress.com +samsclub.com +exblog.jp +hi.ru +unfccc.int +ets.org +liberation.fr +deepintent.com +nrk.no +realclearpolitics.com +jpmorganchase.com +statnews.com +alarabiya.net +windy.com +coccoc.com +apartmenttherapy.com +beijing.gov.cn +al.com +ojooo.com +maryland.gov +toyota.com +ctnsnet.com +lotterypost.com +yle.fi +mercola.com +52pojie.cn +waze.com +pconline.com.cn +selfridges.com +sbrf.ru +thinkific.com +dnb.com +lieyunwang.com +legifrance.gouv.fr +avaaz.org +queensu.ca +networkworld.com +timeshighereducation.com +ourworldindata.org +colorado.gov +jwpcdn.com +eltiempo.com +thecut.com +udacity.com +aps.org +express.dhl +idc.com +orlandosentinel.com +thebase.in +premierbet.co.ao +fws.gov +v2ex.com +adskeeper.co.uk +thenational.ae +mercantilbanco.com +dailystar.co.uk +chronicle.com +glpals.com +36kr.com +samplicio.us +tate.org.uk +sc-cdn.net +plex.tv +wolfram.com +gearbest.com +insidehighered.com +ask.fm +ucf.edu +myway.com +detroitnews.com +samhsa.gov +creativemarket.com +lesechos.fr +porsche.com +medicalxpress.com +stockx.com +ss2.us +allmusic.com +blurb.com +doodlekit.com +ipv4only.arpa +12306.cn +speakerdeck.com +dellsupportcenter.com +glamour.com +e-recht24.de +dailykos.com +mxc.com +betrad.com +line.biz +imgix.net +bfmtv.com +ccb.com +beyla.site +company-target.com +northeastern.edu +atwola.com +wshareit.com +ku.edu +instapaper.com +presscustomizr.com +gazeta.pl +dhl.de +11467.com +yomiuri.co.jp +libreoffice.org +healthcare.gov +bleepingcomputer.com +entrust.net +startpage.com +wbidder.online +getintopc.com +tut.by +eurogamer.net +virgilio.it +bittorrent.com +guru99.com +elmostaqbal.com +caranddriver.com 
+126.com +nest.com +uvm.edu +peatix.com +curseforge.com +kpmg.com +pikabu.ru +mangadex.org +sprint.com +military.com +avira.com +homeadvisor.com +gridoto.com +dji.com +whitepages.com +instacart.com +amazon.ae +resumersvo.fun +uio.no +americanbar.org +datadoghq.com +fotor.com +meizu.com +tenor.com +reason.com +rand.org +shrm.org +pantip.com +theweek.com +guinnessworldrecords.com +jigsy.com +a2z.com +uptobox.com +prom.ua +liebertpub.com +zdassets.com +fsf.org +cbp.gov +wpastra.com +emirates.com +alz.org +usdoj.gov +accorhotels.com +mariadb.org +txxx.com +zingnews.vn +rtl-theme.com +canadapost.ca +swagbucks.com +6pm.com +womenshealthmag.com +dhgate.com +cryoutcreations.eu +searchenginewatch.com +hsbc.com.hk +diply.com +bfmio.com +mercadolibre.com +javatpoint.com +symantecliveupdate.com +credit-agricole.fr +adingo.jp +urbanoutfitters.com +site123.me +westernunion.com +wordstream.com +fourseasons.com +nationalarchives.gov.uk +foodmate.net +huya.com +google.lk +rackspace.com +tagesspiegel.de +fwmrm.net +20minutos.es +odnoklassniki.ru +cmoney.tw +cuevana3.io +hawaaworld.com +templatemonster.com +lexisnexis.com +handelsblatt.com +3isk.video +designboom.com +popads.net +fu-berlin.de +uva.nl +canalblog.com +bc.edu +ifixit.com +afr.com +bcg.com +s-microsoft.com +scielo.br +appledaily.com +naturalnews.com +themezee.com +encyclopedia.com +coconala.com +daftsex.com +abema.tv +heytapdownload.com +unep.org +signal.org +haberturk.com +ad-m.asia +blogtalkradio.com +freepik.es +znanija.com +hku.hk +epfl.ch +lmgtfy.com +adcolony.com +realsimple.com +google.hr +britishairways.com +html5up.net +cfsbcn.com +garenanow.com +elespanol.com +honda.com +supremecourt.gov +familydoctor.com.cn +hackernoon.com +google.no +inmotionhosting.com +yinyuetai.com +fanfiction.net +correios.com.br +prtimes.jp +teacherspayteachers.com +rapidgator.net +clean.gg +drexel.edu +lsu.edu +geekwire.com +drugabuse.gov +sydney.edu.au +narvar.com +adverdirect.com +bhg.com +cjb.net +bmo.com +seagate.com +iea.org +stltoday.com +windscribe.com +nhtsa.gov +khaleejtimes.com +nta.go.jp +investors.com +parents.com +docusign.com +myniceposts.com +comodo.com +firstpost.com +brand-display.com +bgr.com +crhoy.com +splashthat.com +clickbank.com +akhbarelyom.com +eum-appdynamics.com +jivox.com +plosone.org +readthedocs.org +ala.org +topuniversities.com +netvibes.com +wilfulpessimistic.com +on24.com +css-tricks.com +labanquepostale.fr +oxforddictionaries.com +asm.org +lazada.com.my +cibc.com +buzzsprout.com +myportfolio.com +sentry.io +jut.su +sznews.com +mxplayer.in +google.rs +wyndhamhotels.com +payscale.com +ssionsupre.fun +cnil.fr +diabetes.org +beeg.com +virtualearth.net +chanel.com +bestbuy.ca +extremetech.com +kenh14.vn +postrelease.com +kansascity.com +esri.com +successfactors.eu +afp.com +pbworks.com +pagesix.com +dailydot.com +so-net.ne.jp +cretgate.com +visitsfunk.com +theblaze.com +intensedebate.com +fang.com +myvisualiq.net +dramacool.so +kaggle.com +apartments.com +foodandwine.com +ebayimg.com +mindmeister.com +cox.net +boardgamegeek.com +nwsource.com +polyfill.io +politifact.com +financialpost.com +prensalibre.com +cmbchina.com +oprah.com +opentable.com +ultipro.com +java.net +pochta.ru +seesaw.me +canada.com +people.cn +wangdaidongfang.com +digiato.com +drive2.ru +ewg.org +tmtpost.com +nola.com +jalopnik.com +smallbiztrends.com +klarna.com +laweekly.com +gd.gov.cn +paris.fr +slack-edge.com +vtv.vn +razer.com +active.com +afreecatv.com +creativebloq.com +mirconnect.ru +collinsdictionary.com +google.com.bd +king.com +elconfidencial.com 
+epicurious.com +bp.com +we.tl +benzinga.com +lbl.gov +nla.gov.au +home.pl +searchsecurer.com +macworld.com +kcl.ac.uk +ocn.ne.jp +poetryfoundation.org +pri.org +laravel.com +sf-express.com +wanfangdata.com.cn +finviz.com +gao.gov +chetor.com +kafan.cn +thespruceeats.com +biblehub.com +yoomoney.ru +loopme.me +icy-veins.com +laposte.fr +law.com +tightsaturdayi.com +jwpltx.com +bonappetit.com +britishmuseum.org +oanda.com +tap.az +bcebos.com +rawstory.com +cargurus.com +vectorstock.com +zozo.jp +thewrap.com +ballotpedia.org +techspot.com +vcu.edu +yamaha.com +seesaa.net +city-data.com +tinkoff.ru +freebitco.in +netsuite.com +tilltucked.com +familysearch.org +plannerladyreality.com +rozetka.com.ua +penzu.com +baskino.me +thinkprogress.org +aftonbladet.se +sf.net +pbskids.org +leetcode.com +marvel.com +getcomposer.org +iol.co.za +puma.com +couchsurfing.com +dstv.com +bb.com.br +disquscdn.com +nexon.com +news-medical.net +cas.cn +officedepot.com +authorize.net +filehorse.com +authorstream.com +socialmediatoday.com +oneindia.com +swissinfo.ch +y8.com +delish.com +artnet.com +rediffmailpro.com +panoramio.com +smore.com +xgo.com.cn +food.com +manta.com +oppositehometowndrunken.com +uvic.ca +jjwxc.net +chinatimes.com +ntu.edu.tw +biorxiv.org +esheeq.co +csiro.au +ndr.de +ui.com +motorola.com +bl.uk +cybozu.com +ranker.com +pinterest.fr +woot.com +siriusxm.com +walmart.ca +xerox.com +u.gg +opencart.com +ring.com +imo.im +medicare.gov +nrdc.org +archive.is +launchdarkly.com +dzone.com +worldwildlife.org +shahid4u.onl +cuhk.edu.hk +sc.edu +windowscentral.com +workable.com +kajabi.com +coub.com +adhigh.net +kinokrad.co +standardmedia.co.ke +nfpa.org +discuss.com.hk +ad4m.at +chatwork.com +reviewjournal.com +tsichuan.com +rookmemorizevoluntary.com +vistaprint.com +caliente.mx +rug.nl +gandi.net +andersnoren.se +softbank.jp +photoshelter.com +chewy.com +cineulagam.com +dickssportinggoods.com +alternet.org +esy.es +bzw315.com +krakow.pl +5118.com +google.kz +headspace.com +geocaching.com +adoptapet.com +wikipedia.com +tentmess.com +imooc.com +189.cn +manchestereveningnews.co.uk +medrxiv.org +askubuntu.com +wikimapia.org +enamad.ir +linecorp.com +helpguide.org +zarinpal.com +dbankcloud.asia +noon.com +baomoi.com +splittingpick.com +cracked.com +theculturetrip.com +sciencenews.org +tradingeconomics.com +fotolia.com +sc.com +haplat.net +letterboxd.com +asp.net +pinterest.es +ontario.ca +cna.com.tw +essayswriting.org +bshare.cn +cpsc.gov +pantone.com +bdurl.net +parliament.uk +igvita.com +bbcgoodfood.com +tvguide.com +jdoqocy.com +lazada.co.id +unrealengine.com +loupan.com +sch.gr +newstatesman.com +smartasset.com +jio.com +in.net +cfr.org +trustedreviews.com +comicbook.com +boeing.com +lufthansa.com +nikkeibp.co.jp +acuityscheduling.com +cntraveler.com +mrporter.com +5acbd.com +listindiario.com +ekaie.com +ingentaconnect.com +royalcbd.com +moonfruit.com +rqmob.com +movieweb.com +kaltura.com +siteorigin.com +vocabulary.com +1377x.to +ericsson.com +spring.io +images-amazon.com +vidible.tv +service-public.fr +similarweb.com +signupgenius.com +bostonherald.com +meti.go.jp +ssc.nic.in +notepad-plus-plus.org +klikbca.com +internetdownloadmanager.com +nypl.org +meduza.io +themuse.com +heytapimg.com +mfisp.com +bayern.de +synology.com +androidauthority.com +andhrajyothy.com +unt.edu +9anime.to +eenadu.net +sdsu.edu +jagranjosh.com +gumtree.com.au +epochtimes.com +dantri.com.vn +otto.de +smashballoon.com +intercom.io +wdr.de +bigthink.com +kuaidi100.com +ncsl.org +seriouseats.com +dividedscientific.com +kinopoisk.ru 
+tesco.com +zee5.com +skypeassets.com +innovid.com +google.lt +sltrib.com +cocolog-nifty.com +2gis.ru +genpi.co +kuleuven.be +hivestreaming.com +surveygizmo.com +wnd.com +ky.gov +dzwww.com +foxitsoftware.com +omnithrottle.com +fnac.com +architecturaldigest.com +xsrv.jp +izooto.com +zoomit.ir +nordvpn.com +lazada.vn +revcontent.com +onmicrosoft.com +bhaskar.com +irna.ir +sdpnoticias.com +desmos.com +googlecode.com +xtx6.com +iata.org +nottingham.ac.uk +aap.org +ilsole24ore.com +kotak.com +waveapps.com +wizards.com +tkqlhce.com +hawaii.gov +time.ir +greenend.org.uk +clever.com +blip.tv +haberler.com +tudelft.nl +calculator.net +tarafdari.com +absher.sa +rayjump.com +biggo.com.tw +theoutnet.com +ruten.com.tw +symfony.com +cdn.house +ameli.fr +ig.com.br +www.gob.mx +gaana.com +zimbra.com +sastasundar.com +adkernel.com +dpbolvw.net +madrasati.sa +goldcarpet.cn +avclub.com +poki.com +society6.com +main.jp +2m.ma +comscore.com +xiu.com +uni-muenchen.de +usaid.gov +brave.com +districtbaloneywhiskers.com +isna.ir +sina.com +elitedaily.com +ethetrader.com +thejakartapost.com +fnb.co.za +iasds01.com +sleepfoundation.org +yammer.com +okcupid.com +ispot.tv +royalmail.com +hotpepper.jp +doswinuba.com +sproutsocial.com +energystar.gov +shopbop.com +tiki.vn +ohio-state.edu +umontreal.ca +diariolibre.com +tiscali.it +ons.gov.uk +comcast.com +erne.co +steemit.com +9978.cn +emerald.com +panda.org +justpaste.it +novinky.cz +heroku.com +bmi.ir +cbcloud.sg +codeproject.com +logmein.com +roozaneh.net +youjizz.com +mailerlite.com +businessnewsdaily.com +google.by +airtable.com +postimg.cc +kqzyfj.com +aadrm.com +rajasthan.gov.in +binomo-website.com +bangkokpost.com +mediapost.com +mostaghelonline.com +blogspot.nl +steamstatic.com +china.org.cn +spanishdict.com +qualcomm.com +unh.edu +wps.cn +toppr.com +kff.org +sxsw.com +slack-imgs.com +kqed.org +s-onetag.com +tfl.gov.uk +stern.de +se.com +9lianmeng.com +screencast.com +fmkorea.com +olx.com.pk +airtel.in +insightexpressai.com +enstage-sas.com +1password.com +mediaset.it +easeus.com +boxofficemojo.com +dream.co.id +privatbank.ua +hermes.com +turnitin.com +aboutamazon.com +adelaide.edu.au +infolinks.com +sears.com +spglobal.com +smashwords.com +pypi.org +nouvelobs.com +trustwave.com +turbo.az +fordham.edu +lastampa.it +exe.app +gjirafa.com +bitcointalk.org +gadgetnews.net +auction.co.kr +safedog.cn +cardinalcommerce.com +paypalobjects.com +ucoz.com +famethemes.com +khtahmar.com +saksfifthavenue.com +fiu.edu +kiplinger.com +volvocars.com +ou.edu +sparknotes.com +pchouse.com.cn +wowkeren.com +google.com.ly +printfriendly.com +footprint.net +blogspot.com.br +bancoestado.cl +360safe.com +filehippo.com +toptal.com +suntrust.com +nghttp2.org +sh.gov.cn +tvtropes.org +specificfeeds.com +ku.dk +wzrkt.com +countryliving.com +olymptrade.com +jezebel.com +amazonvideo.com +hk01.com +activehosted.com +heritage.org +greenhouse.io +fineartamerica.com +apple.com.cn +pastemagazine.com +grab.com +euractiv.com +ghost.org +arzdigital.com +intentiq.com +commondreams.org +docdroid.net +freehostia.com +peta.org +berlin.de +y5en.com +yxdown.com +download.com +bol.com +pdffiller.com +jnu.edu.cn +lever.co +gencat.cat +vancouversun.com +androidcentral.com +fendi.com +quantcast.com +snapwidget.com +indosport.com +messefrankfurt.com +toto803.com +bnf.fr +mayoclinic.com +chinajsq.cn +clck.ru +ssacdn.com +suumo.jp +uni-heidelberg.de +tdbank.com +cda.pl +mongodb.com +adition.com +uu.se +miitbeian.gov.cn +tver.jp +scroll.in +ncl.ac.uk +scopus.com +anandtech.com +inhabitat.com +lkqd.net 
+derstandard.at +storiespace.com +e-planning.net +nobitex.ir +shop-pro.jp +inkscape.org +case.edu +maine.gov +leo.org +umblr.com +myfreecams.com +ppomppu.co.kr +prntscr.com +msu.ru +globaltimes.cn +ml.com +usgbc.org +xataka.com +dangbei.com +pagesperso-orange.fr +icicidirect.com +kargo.com +mxptint.net +alipayobjects.com +qodeinteractive.com +toggl.com +aliapp.org +walkme.com +smartrecruiters.com +gtimg.com +fashionfindday.com +blogspot.jp +whatismyipaddress.com +media6degrees.com +ndrc.gov.cn +ispconfig.org +sas.com +jcpenney.com +rpi.edu +hypebeast.com +marketo.com +vivo.com +miaopai.com +paytm.in +fox.com +myfonts.com +trust.org +bahn.de +mytheresa.com +minds.com +worktile.com +mikecrm.com +schema.org +thriveglobal.com +vanguardngr.com +reliefweb.int +denetsuk.com +revjet.com +esteri.it +12377.cn +business.com +megaupload.com +gesetze-im-internet.de +tripadvisor.in +giglio.com +ladsp.com +yc58.com +marieclaire.com +handle.net +sony.net +farsnews.ir +producthunt.com +extend.tv +hinet.net +seattlepi.com +soap2day.to +lazada.com +nato.int +mcmaster.ca +jmw.com.cn +brainly.in +shopstyle.com +unige.ch +jagran.com +ltn.com.tw +uc.edu +10010.com +tasnimnews.com +ecv360.com +futurelearn.com +gnupg.org +rferl.org +businessinsider.in +nsatc.net +kth.se +ritzcarlton.com +mihoyo.com +nami.org +postermywall.com +spiceworks.com +kremlin.ru +oxu.az +dnaindia.com +starwoodhotels.com +avct.cloud +rezync.com +10jqka.com.cn +sc-static.net +adbtc.top +thinkgeek.com +ximalaya.com +owasp.org +interfax.ru +aporasal.net +r-project.org +gm.com +clemson.edu +geico.com +archlinux.org +ustc.edu.cn +idealo.de +nate.com +nbcnewyork.com +oclc.org +n11.com +dailytelegraph.com.au +365jia.cn +e-monsite.com +greatandhra.com +acrobat.com +akc.org +stlouisfed.org +rae.es +cloudconvert.com +histats.com +nolo.com +edweek.org +pullcm.com +tennessean.com +folkd.com +mof.gov.cn +inquisitr.com +priceline.com +dr.dk +getty.edu +mosreg.ru +dable.io +onetag-sys.com +sports.ru +patagonia.com +webtoons.com +allocine.fr +tilda.ws +alamy.com +unodc.org +likee.video +businessweekly.com.tw +hightail.com +doramy.club +burberry.com +liftoff.io +newsobserver.com +newsnow.co.uk +sanook.com +newswire.ca +tamin.ir +masterclass.com +mango.com +blog.ir +ada.support +vietnamnet.vn +marinetraffic.com +charlotteobserver.com +sportingnews.com +tci.ir +mwbsys.com +marketingland.com +baiducontent.com +smbc-card.com +agacelebir.com +br.de +bldrdoc.gov +piktochart.com +samsungcloudsolution.net +nybooks.com +neimanmarcus.com +onenote.com +vam.ac.uk +bitcoin.com +123-reg-new-domain.co.uk +uwo.ca +lexpress.fr +houstonchronicle.com +ad-stir.com +invisionapp.com +europa.eu.int +lu.se +jpush.io +aralego.com +yjc.ir +zimbio.com +fararu.com +paytm.com +shell.com +kissmetrics.com +animoto.com +suning.com +emeraldinsight.com +elcomercio.pe +sophosupd.com +secondlife.com +infoplease.com +netcraft.com +lichess.org +svc.ms +usu.edu +rightmove.co.uk +vesti.ru +elastic.co +mindbodyonline.com +ready.gov +band.us +collider.com +visualcapitalist.com +york.ac.uk +ua.edu +xiaohongshu.com +edgyconnaterag.com +wn.com +dailyrecord.co.uk +qualys.com +open.ac.uk +pelisplus.me +tsa.gov +dingtalk.com +telekom.com +pornhubpremium.com +wenming.cn +sakshi.com +omegle.com +webinarjam.com +namava.ir +courant.com +wwd.com +nespresso.com +allafrica.com +qatarairways.com +iefimerida.gr +vmall.com +thedrum.com +digiday.com +pullcf.com +tasteofhome.com +123c.vn +takealot.com +eeoc.gov +wargaming.net +state.co.us +dc.gov +d1net.com +1fichier.com +xhamster7.desi +wanadoo.fr +bazaarvoice.com 
+boohoo.com +badoo.com +gatesfoundation.org +unwomen.org +findarticles.com +adtelligent.com +ul.com +encuentra24.com +ana.co.jp +citylab.com +bizrate.com +mobile01.com +mendeley.com +jwplatform.com +torontosun.com +google.co.ao +draftkings.com +monash.edu +itau.com.br +buyma.com +inverse.com +jetpack.com +pmi.org +tribune.com.pk +12371.cn +realtor.ca +st-andrews.ac.uk +gazetaexpress.com +outsideonline.com +antaranews.com +bom.gov.au +ghanaweb.com +daringfireball.net +survata.com +inner-active.mobi +shiksha.com +jumia.com.ng +supersonic.com +tripsavvy.com +viralporn.com +hnu.edu.cn +google.com.do +lamabang.com +lolesports.com +overdrive.com +corpscorp.online +yallakora.com +lancers.jp +cedexis-radar.net +vecteezy.com +luisaviaroma.com +litres.ru +nos.nl +ally.com +mic.com +slashgear.com +podomatic.com +mgtv.com +awwwards.com +econsultancy.com +somoynews.tv +mensjournal.com +zoominfo.com +netcoresmartech.com +wechat.com +battlenet.com.cn +topsy.com +runnersworld.com +scene7.com +mediaroom.com +google.com.ec +bts.gov +admanmedia.com +rapidssl.com +gazzettadelsud.it +aip.org +ipstatp.com +lepoint.fr +redditstatic.com +plala.or.jp +mercadolibre.cl +alsbbora.info +sudannews365.org +archiveofourown.org +inven.co.kr +care2.com +nzz.ch +export.gov +sheknows.com +offerup.com +mvideo.ru +vestacp.com +turner.com +ecollege.com +ru.com +thehindubusinessline.com +forexfactory.com +ello.co +computerhope.com +tum.de +baylor.edu +f-secure.com +gst.gov.in +infoseek.co.jp +adyen.com +getui.com +salesforceliveagent.com +ig.com +kodak.com +dominos.com +openedition.org +4px.com +golang.org +b2clogin.com +51cto.com +morganstanley.com +tagged.com +djangoproject.com +4channel.org +unicode.org +anthropologie.com +townhall.com +doctolib.fr +adrta.com +lendingtree.com +idaho.gov +coinpayu.com +heraldsun.com.au +smzdm.com +ennaharonline.com +voot.com +lightwidget.com +viu.com +20minutes.fr +proboards.com +revenuenetworkcpm.com +sport.es +punchng.com +kiva.org +computerweekly.com +mail.com +danawa.com +almubasher.com.sa +webself.net +gov.ao +american.edu +statefarm.com +skynet.be +zotero.org +openculture.com +follow.it +hpjav.tv +cra-arc.gc.ca +cyberpolice.cn +nyt.com +jalbum.net +trademe.co.nz +fortunecity.com +nhentai.net +marketplace.org +faithfulfacultativeladder.com +alaska.gov +indystar.com +kyoto-u.ac.jp +rt.ru +expressvpn.com +sofascore.com +smu.edu +cint.com +envato.market +newsvine.com +cinecalidad.is +europapress.es +ucm.es +researchnow.com +sharefile.com +cato.org +google.com.kw +democracynow.org +americanprogress.org +pluralsight.com +elintransigente.com +blackboardcdn.com +126.net +cettire.com +de.tl +szu.edu.cn +bepress.com +popmama.com +c212.net +whmcs.com +michaels.com +funnyordie.com +sendinblue.com +tapjoy.com +georgia.gov +certum.pl +mundodeportivo.com +poznan.pl +acpjournals.org +jawapos.com +most.gov.cn +adobeconnect.com +commonsensemedia.org +trthaber.com +iaea.org +iflscience.com +netfirms.com +futurism.com +rutube.ru +tn.gov +alberta.ca +koreatimes.co.kr +citilink.ru +ato.gov.au +adafruit.com +teenvogue.com +winbank.gr +convio.net +socialmediaexaminer.com +france.tv +fastcodesign.com +shape.com +nature.org +inria.fr +dikaiologitika.gr +streamlabs.com +globalresearch.ca +degruyter.com +ub.edu +kobo.com +easyjet.com +redlink.com.ar +guidestar.org +ulta.com +faradars.org +jpmorgan.com +arynews.tv +zju.edu.cn +jit.si +xunlei.com +shafaqna.com +mnn.com +nationalacademies.org +estadao.com.br +oath.cloud +bing.net +thunderbird.net +formula1.com +alibaba-inc.com +cabelas.com +worldoftanks.eu 
+tubemogul.com +upstox.com +brainly.com +o2.pl +childrensalon.com +22.cn +cancerresearchuk.org +mediaite.com +homes.co.jp +gigabyte.com +milenio.com +equifax.com +desmoinesregister.com +easytomessage.com +govtrack.us +cic.gc.ca +audioboom.com +gaadiwaadi.com +msidentity.com +azlyrics.com +cplusplus.com +google.co.ma +threadless.com +express.pk +cheatsheet.com +discordapp.net +indozone.id +nab.com.au +samsungcloudsolution.com +glassdoor.co.in +db.tt +popin.cc +onstunkyr.com +slideserve.com +arbeitsagentur.de +wayne.edu +itunes.com +mix.com +cision.com +radikal.ru +xserver.ne.jp +petapixel.com +studfile.net +crowd1.com +doe.gov +dw.de +gsis.gr +aafp.org +b2b.cn +fireeye.com +like.video +9news.com.au +jal.co.jp +jiemian.com +bible.com +stats.gov.cn +truecaller.com +cimaclub.in +ruliweb.com +kq36.com +govinfo.gov +kissasian.sh +mindtools.com +vogue.co.uk +medcom.id +androidpolice.com +techdirt.com +webull.com +xm.com +samsungdm.com +amgdgt.com +eugdpr.org +argaam.com +eccn.com +ip138.com +oecd-ilibrary.org +ipsos.com +lapatilla.com +kia.com +sqlite.org +themoscowtimes.com +alltrails.com +artsy.net +brealtime.com +bplaced.net +gsk.com +rtactivate.com +zenfolio.com +cn163.net +brainly.co.id +kahoot.com +fmprc.gov.cn +worthpoint.com +digitaloceanspaces.com +annals.org +magento.com +vitalsource.com +brassring.com +dispatch.com +wikitravel.org +informa.com +elperiodico.com +thermofisher.com +io9.com +deseret.com +italist.com +adtimaserver.vn +subito.it +quillbot.com +1plus1tv.ru +pdx.edu +godwineagles.org +syracuse.com +mindbodygreen.com +gxu.edu.cn +planalto.gov.br +otzovik.com +imore.com +tinkercad.com +julian-fashion.com +nga.cn +iyfubh.com +linguee.com +ntu.edu.sg +checkpoint.com +indiaglitz.com +garant.ru +mabanque.bnpparibas +epson.com +ip-api.com +stripe.network +sat.gob.mx +abchina.com +aif.ru +allure.com +bayer.com +postimage.org +privat24.ua +amnh.org +chaoxing.com +fca.org.uk +sciencenet.cn +mt.gov +liu.se +userreport.com +tuwien.ac.at +eldiario.es +s-msedge.net +wbs-law.de +finra.org +makezine.com +rwth-aachen.de +veoh.com +blogspot.se +abc7.com +fudan.edu.cn +inps.it +carbonmade.com +kinsta.com +hpage.com +ccavenue.com +photopea.com +okaz.com.sa +boe.es +e-hentai.org +conac.cn +cox.com +hotjar.io +cinemablend.com +newindianexpress.com +cigna.com +nu.nl +htc.com +keybase.io +crateandbarrel.com +ccm.net +liveonscore.tv +shaw.ca +laleggepertutti.it +izatcloud.net +venmo.com +iqoption.com +auspost.com.au +webshots.com +turkishairlines.com +arabnews.com +paloaltonetworks.com +trilltrill.jp +foreignaffairs.com +delaware.gov +ontvtime.ru +contentmarketinginstitute.com +piliapp.com +google.lv +ali213.net +mingpao.com +capterra.com +nv.gov +tf1.fr +wfp.org +d1ev.com +blogspot.ru +ovh.co.uk +opera-mini.net +hasbro.com +tapatalk.com +gigya.com +youzan.com +zoosnet.net +allstate.com +chinagate.cn +wnyc.org +wowma.jp +buyma.us +kroger.com +impots.gouv.fr +xboxab.com +valuecommerce.com +infourok.ru +adskeeper.com +wps.com +mega.co.nz +uab.edu +undertone.com +smartinsights.com +pfizer.com +lexico.com +exeter.ac.uk +lyft.com +ibanking-services.com +taroot-rangi.com +google.iq +telenet.be +republicworld.com +pantheonsite.io +iam.gov.sa +uzh.ch +rokna.net +worldstarhiphop.com +kongregate.com +google.com.ng +searchmulty.com +pushails.com +collabserv.com +a-mo.net +cuni.cz +cincinnati.com +mofa.go.jp +tau.ac.il +westerndigital.com +pinterest.jp +gazzetta.it +emailmeform.com +mob.com +spectator.co.uk +poynter.org +malavida.com +consumerfinance.gov +auburn.edu +europepmc.org +ada.org +uib.no +7po.com 
+bonanza.com +tenor.co +myflorida.com +dytt8.net +popdaily.com.tw +edupage.org +nationalgeographic.org +peacocktv.com +emb-japan.go.jp +popbela.com +purevolume.com +techweb.com.cn +mlit.go.jp +syosetu.com +ccaonline.cn +vuejs.org +yr.no +nttdocomo.co.jp +usask.ca +66cruises.com +fullhdfilmizlesene.com +newgrounds.com +aspca.org +ai.marketing +ktla.com +broadwayworld.com +kit.edu +techtimes.com +sc-prod.net +beatport.com +flippingbook.com +goldmansachs.com +tim.it +docomo.ne.jp +domestika.org +uptodate.com +campaignmonitor.com +nsdl.com +storyblocks.com +ezgif.com +progressive.com +mercedes-benz.com +sdamgia.ru +toasttab.com +brunarosso.com +metoffice.gov.uk +domdex.com +pandasecurity.com +timesunion.com +marksandspencer.com +ichano.cn +web-hosting.com +b-cdn.net +weiyun.com +sportskeeda.com +urbanairship.com +instyle.com +tidal.com +harpercollins.com +overleaf.com +corporatefinanceinstitute.com +hh010.com +wsimg.com +deviantart.net +unibo.it +plywoodenchant.com +nflximg.net +toyokeizai.net +go2cloud.org +brightspace.com +itslearning.com +gleam.io +icrc.org +muffingroup.com +webopedia.com +thumbtack.com +qoo10.jp +sheffield.ac.uk +capgemini.com +casetify.com +scitation.org +caixabank.es +technet.com +freemake.com +telugu360.com +wmo.int +williams-sonoma.com +shifen.com +electrek.co +syfy.com +mercadopago.com.ar +jsfiddle.net +prevention.com +auckland.ac.nz +setare.com +alaska.edu +spacex.com +v-mate.mobi +geistm.com +nhm.ac.uk +repec.org +utwente.nl +president.gov.ua +getcourse.ru +okstate.edu +vedomosti.ru +monbin.site +kraken.com +wordhippo.com +edgecastcdn.net +saednews.com +ahrq.gov +csoonline.com +zamzar.com +gamasutra.com +documentforce.com +mymodernmet.com +harrods.com +obsproject.com +admitad.com +magazineluiza.com.br +eweek.com +netlify.com +suicidepreventionlifeline.org +cbr.com +gameforge.com +cncn.org.cn +rijksoverheid.nl +hrloo.com +nutrition.org +statesman.com +appboy.com +amtrak.com +lacounty.gov +sage.com +playstation.net +ornl.gov +rki.de +westpac.com.au +gao7.com +yaplakal.com +gsxt.gov.cn +blogsky.com +dxy.cn +smallseotools.com +mojifen.com +alistapart.com +visymo.com +bicentenariobu.com +fragrantica.com +skrill.com +ces.tech +pbc.gov.cn +magodasimagens.com.br +tencent-cloud.net +20min.ch +masterpapers.com +csulb.edu +kent.edu +storeboard.com +cognitivlabs.com +agenziaentrate.gov.it +krebsonsecurity.com +microsoft.net +soton.ac.uk +tcd.ie +bundestag.de +ringcentral.com +testbook.com +worldoftanks.ru +nintendo.net +bozhong.com +sphinx-doc.org +ichano.com +cian.ru +hsbc.com +macmillan.com +openlibrary.org +nrich.ai +googleoptimize.com +gismeteo.ua +miniclip.com +radio-canada.ca +rikunabi.com +www.gov.pl +xuexi.cn +smartclip.net +deseretnews.com +eporner.com +uni-hamburg.de +rsf.org +svt.se +gsa.gov +mobirise.info +ko-fi.com +cityu.edu.hk +rs-online.com +univision.com +aircanada.com +licindia.in +index-education.net +icloud.com.cn +friv.com +99designs.com +freshbooks.com +fairmont.com +cbinsights.com +delgarm.com +jeuxvideo.com +ampproject.net +bestwestern.com +newtalk.tw +art.pl +eventbrite.ca +wtop.com +uproxx.com +laopm.com +callofduty.com +jxmall.com +healthgrades.com +pewtrusts.org +nai.com +cisa.gov +pinterest.com.au +nationaltrust.org.uk +digitalspy.com +locationsreverenceaid.com +mn.gov +storygize.net +fishki.net +microsoftazuread-sso.com +goibibo.com +dlsite.com +newsit.gr +openlearning.com +wrike.com +thalesgroup.com +multiurok.ru +consumeraffairs.com +payforessay.net +torrentfreak.com +deadspin.com +hik-connect.com +smbc.co.jp +etonline.com +prada.com 
+gothamist.com +cba.pl +doramatv.live +playground.xyz +sunsu521.com +isi.edu +wvu.edu +mi-img.com +anticheatexpert.com +steinberg.net +smm.cn +amazonalexa.com +airbus.com +dupont.com +humanesociety.org +postimages.org +verywellfit.com +duniagames.co.id +directredirection.com +academic.ru +ipify.org +16personalities.com +weightwatchers.com +medicaldaily.com +wisegeek.com +revopush.com +webcindario.com +douyincdn.com +dv37.com +findagrave.com +commentcamarche.net +hackerrank.com +usabilla.com +pdf2go.com +healthychildren.org +le.ac.uk +bbwhf.com +doaj.org +mseav.com +galacticmenueasier.com +tomtom.com +theathletic.com +johnlewis.com +transparency.org +ucsusa.org +annualreviews.org +brownpapertickets.com +zingmp3.vn +jnj.com +netteller.com +which.co.uk +rai.it +nyti.ms +vodafone.de +fx678.com +pocket-lint.com +viadeo.com +motortrend.com +knightlab.com +spreadshirt.com +delltechnologies.com +crisp.chat +dailywire.com +csic.es +aralego.net +wpbeginner.com +caichongwang.com +wral.com +pagina12.com.ar +raider.io +iucn.org +sublimetext.com +doorblog.jp +audacityteam.org +corporate-ir.net +wowkorea.jp +mhthemes.com +google.si +sierraclub.org +patheos.com +corel.com +alfavita.gr +xoom.com +ucar.edu +carleton.ca +tbs.co.jp +petfinder.com +thediplomat.com +hse.gov.uk +imgflip.com +doc.gov +opensecrets.org +diplo.de +dmxleo.com +haveibeenpwned.com +beinsports.com +gnavi.co.jp +adtdp.com +zxxk.com +popcash.net +wisconsin.gov +polldaddy.com +unext.jp +talkingpointsmemo.com +ikco.ir +ga.gov +sfsu.edu +pelisplushd.net +ithome.com +g2a.com +lyst.com +sagawa-exp.co.jp +canadiantire.ca +web.com +wkzuche.com +lequipe.fr +uni-koeln.de +argos.co.uk +vidazoo.com +xuite.net +hupu.com +hu-berlin.de +de17a.com +phonearena.com +iarc.fr +choicehotels.com +mizuhobank.co.jp +domain.com +ghostery.com +dur.ac.uk +subway.com +mosaiquefm.net +loewe.com +project-syndicate.org +japantoday.com +tubecup.net +sparkfun.com +webflow.com +ssllabs.com +mint.com +prlog.org +unacademy.com +oppo.com +fanlibang.com +ebaumsworld.com +uottawa.ca +aruba.it +las2orillas.co +clarium.io +wantedly.com +dal.ca +grammy.com +health.gov.au +hudong.com +icons8.com +7gz.com +roboform.com +google.com.qa +sjsu.edu +worldatlas.com +eleconomista.es +secu100.com +gemius.pl +moengage.com +buffalonews.com +trezor.io +beauty321.com +bigolive.tv +hackaday.com +template.net +px-cloud.net +tineye.com +linode.com +mangakakalot.com +yfrog.com +lin.ee +tabnak.ir +flagcounter.com +saic.gov.cn +fontawesome.io +ooopic.com +adlightning.com +ugent.be +adme.ru +pennlive.com +daserste.de +wyborcza.pl +weddingwire.com +99acres.com +books.com.tw +ilmeteo.it +getgo.com +concordia.ca +redgifs.com +unctad.org +miwifi.com +yidianzixun.com +uplynk.com +extendthemes.com +kbs.co.kr +mozillazine.org +4399.com +haber7.com +hsforms.com +4pda.ru +ole.com.ar +srf.ch +geforce.com +acast.com +provincial.com +speedyloan.net +commonapp.org +techtudo.com.br +grist.org +gale.com +qvc.com +playbuzz.com +no-ip.org +regulations.gov +wwe.com +etsystatic.com +tucows.com +myapp.com +pcpartpicker.com +collegehumor.com +jpn.org +maine.edu +exacttarget.com +moe.gov.sa +google.tn +villagevoice.com +microad.jp +northjersey.com +verajohn.com +linksys.com +5i8xkqjmqubv.top +nabble.com +vimeocdn.com +sascdn.com +batds.net +post.ir +publishersweekly.com +epik.com +pewsocialtrends.org +dmkt-sp.jp +greatist.com +umt.edu +mparticle.com +jobvite.com +exploratorium.edu +potterybarn.com +innity.com +movember.com +tv-tokyo.co.jp +ensighten.com +trustx.org +hea.cn +tommy.com +karger.com +christies.com 
+tongdun.net +olx.com.eg +admicro.vn +mohurd.gov.cn +rmit.edu.au +rooziato.com +mynewsdesk.com +simplemachines.org +bristol.ac.uk +ardmediathek.de +klm.com +mobirise.com +nbg.gr +clickagy.com +theweathernetwork.com +pravda.ru +petitiononline.com +wdc.com +btloader.com +nrw.de +cochrane.org +pathofexile.com +yourstory.com +merchantcircle.com +belfasttelegraph.co.uk +jinshuju.net +uploaded.net +ladsp.jp +dev.to +xueqiu.com +appdynamics.com +ushmm.org +hln.be +rapidtables.com +nec.com +excite.com +autonavi.com +guidechem.com +montrealgazette.com +rivals.com +betanews.com +credit-suisse.com +infoq.com +e2ma.net +secureinternetbank.com +housebeautiful.com +te.eg +pen.io +mirtesen.ru +wri.org +picuki.com +gdz.ru +tu-berlin.de +sonypictures.com +eroterest.net +dg-datenschutz.de +pearsonvue.com +regnum.ru +bankbazaar.com +gridserver.com +graphicriver.net +jimdosite.com +jcrew.com +tbcache.com +getsatisfaction.com +csun.edu +us-cert.gov +taz.de +abc7news.com +tom.com +vidal.fr +sundaysky.com +elcorteingles.es +contently.com +guancha.cn +cnr.it +uwa.edu.au +monash.edu.au +appannie.com +gameinformer.com +elbotola.com +umbc.edu +ap.gov.in +masslive.com +superbthemes.com +intercomcdn.com +uni-bonn.de +ahnlab.com +nar.realtor +i.ua +trafficfactory.biz +jagonews24.com +publpush.com +rutor.info +argentina.gob.ar +uschamber.com +colourlovers.com +utm.edu +dailyherald.com +cgtn.com +bumlam.com +libguides.com +httpwg.org +windowsreport.com +reactjs.org +cyberleninka.ru +xitek.com +timesnownews.com +nrf.com +tor.com +periscope.tv +urldefense.com +56.com +9876ydd.com +01net.com +test.com +opendemocracy.net +warriorplus.com +walesonline.co.uk +paho.org +yesky.com +belkin.com +christianitytoday.com +ficbook.net +marxists.org +zulily.com +samsunghealth.com +intipseleb.com +winamp.com +random.org +fandango.com +bundesregierung.de +askmen.com +load24.biz +ssense.com +nbclosangeles.com +onlinehome.us +tokyo.lg.jp +splcenter.org +activision.com +wsdvs.com +bdnews24.com +at.ua +iht.com +tue.nl +upm.es +techacademy.jp +blastcahs.com +ofhappinyer.com +adobess.com +businesstoday.com.tw +xg4ken.com +iloveimg.com +macleans.ca +dynamics.com +directv.com +amazon.sa +liverpoolecho.co.uk +moe.edu.cn +blibli.com +simonandschuster.com +lexology.com +imotech.tech +odu.edu +basf.com +gos-gsp.io +kontur.ru +msn.cn +autonews.com +pagesjaunes.fr +nationalinterest.org +pirate-bay.net +segmentfault.com +rtbf.be +archives-ouvertes.fr +powerlinks.com +niuche.com +piriform.com +vademecum.es +shef.ac.uk +biz.ua +hrblock.com +8tracks.com +theringer.com +mgmresorts.com +wonderhowto.com +techinasia.com +jugem.jp +ionos.de +physiology.org +im-apps.net +namesilo.com +worksmobile.com +programiz.com +nh.gov +audubon.org +xml-sitemaps.com +irecommend.ru +cloudflareinsights.com +komonews.com +sonos.com +navyfederal.org +tulane.edu +lapresse.ca +thomasnet.com +fout.jp +hover.com +nessma.tv +sling.com +lihkg.com +hse.ru +acronis.com +gizmag.com +fdic.gov +yudu.com +nxp.com +matchesfashion.com +avgle.com +palmbeachpost.com +aetna.com +localbitcoins.com +arcot.com +iranserver.com +e-estekhdam.com +star-telegram.com +ctfassets.net +sbicard.com +uwm.edu +etnet.com.hk +dreamhosters.com +escapefromtarkov.com +depaul.edu +annualcreditreport.com +teslamotors.com +pro-market.net +tsyndicate.com +shinobi.jp +computerbild.de +fb.watch +repl.it +vrt.be +permutive.com +adorama.com +chevrolet.com +square-enix.com +tirto.id +rundsp.com +omny.fm +bounceexchange.com +magcloud.com +connexity.net +msocsp.com +citibankonline.com +anz.com +sz.gov.cn +runescape.wiki 
+microfocus.com +xtgem.com +rarathemes.com +madmimi.com +l-msedge.net +brobible.com +microchip.com +dict.cc +lumenlearning.com +linux.com +edmodo.com +guifun.com +symbolab.com +worldfcdn.com +courier-journal.com +domain.com.au +mts.ru +hol.es +tnt.com +court.gov.cn +sportzwiki.com +carfax.com +liveabout.com +iconfinder.com +awin.com +nuance.com +cbn.com +aol.de +onliner.by +ofweek.com +plannedparenthood.org +olx.ro +pch.com +unbounce.com +flightaware.com +bitmoji.com +ntv.ru +eastdane.com +daveramsey.com +bloombergquint.com +g2.com +gitee.com +qianlong.com +tinypng.com +milb.com +pymnts.com +piojm.tech +yahoo.com.tw +lianjia.com +thedrive.com +reverb.com +emol.com +surrey.ac.uk +neoldu.com +polimi.it +saude.gov.br +hongkiat.com +gigafile.nu +appsto.re +adn.com +newsmth.net +parade.com +almaany.com +samsungqbe.com +fumail.de +ri.gov +ria.com +devpost.com +sumo.com +rian.ru +google.com.uy +mikrotik.com +ackcdn.net +gcloudcs.com +vu.nl +eqads.com +vlive.tv +promotional-concepts.de +visitbeijing.com.cn +law360.com +viator.com +lasvegassun.com +fujitv.co.jp +mohrss.gov.cn +nation2.com +kapook.com +madison.com +ftchinese.com +wmgtr.com +labirint.ru +justwatch.com +uoguelph.ca +mtu.edu +koreaherald.com +factcheck.org +aftership.com +spotxcdn.com +wallethub.com +baike.com +zohopublic.com +weixinyunduan.com +e-taxes.gov.az +rarlab.com +jamieoliver.com +wetter.com +rr.com +turbobit.net +google.jo +flyme.cn +diabetesjournals.org +sharepointonline.com +googlezip.net +thenounproject.com +sm.cn +earthday.org +lenovo.com.cn +mysanantonio.com +emag.ro +pof.com +nd.gov +sfweekly.com +jdsports.com +blackmagicdesign.com +tanx.com +smartthings.com +expansion.com +rbxcdn.com +huji.ac.il +sankei.com +elementor.com +the-scientist.com +militarytimes.com +oas.org +ccgp.gov.cn +onamae.com +player.fm +sendgrid.com +meltwater.com +gu.se +dothome.co.kr +counterpunch.org +emptyhammock.com +seattle.gov +liveperson.net +nine.com.au +tensorflow.org +carwale.com +duba.com +9to5google.com +forever21.com +twoo.com +7k7k.com +ttu.edu +birminghammail.co.uk +pptv.com +lung.org +lge.com +nation.africa +db.com +adtechus.com +freenet.de +pressherald.com +randomhouse.com +europeana.eu +glosbe.com +avvo.com +lightinthebox.com +ethereum.org +sucuri.net +ushareit.com +pg.com +ddns.net +xmsecu100.net +triblive.com +weidian.com +qmul.ac.uk +jamendo.com +townandcountrymag.com +ename.com +makeleio.gr +coolors.co +brother.com +wur.nl +sii.cl +friendfeed.com +balenciaga.com +nazwa.pl +ahram.org.eg +fun48.com +streamyard.com +parsely.com +kuaishou.com +serverfault.com +js.org +abplive.com +hrc.org +simplecast.com +filmaffinity.com +tuoitre.vn +gotporn.com +usajobs.gov +twincities.com +videoplayerhub.com +tomsk.ru +bris.ac.uk +ctobsnssdk.com +desdev.cn +masrawy.com +cmcm.com +linux.org +xinmin.cn +foodnetwork.co.uk +umanitoba.ca +astm.org +tiu.ru +crazygames.com +sfexaminer.com +azurefd.net +google.org +sans.org +dtscout.com +diamond.jp +techopedia.com +telekom.de +taringa.net +presslogic.com +zety.com +banamex.com +bao315.com +uark.edu +nbcbayarea.com +drtuber.com +ambafrance.org +localytics.com +nga.gov +vermont.gov +edutopia.org +mihanblog.com +lockerdome.com +transunion.com +aei.org +nyse.com +blackplanet.com +multiply.com +hs-scripts.com +timesofmalta.com +rockcontent.com +fanpop.com +king5.com +digikey.com +yougov.com +greatschools.org +taylorandfrancis.com +elyamnelaraby.com +miaozhen.com +ad-delivery.net +uk.net +dndbeyond.com +nc.gov +pstatic.net +khabaronline.ir +au.dk +pronews.gr +warriorforum.com +dutchtracking.nl +customs.gov.cn 
+9game.cn +nmsu.edu +nintendo.co.jp +eurobricks.com +imgbb.com +bbt.com +spotifycdn.com +ck12.org +cognitoforms.com +mathsisfun.com +westelm.com +llnwd.net +whois.com +ulaval.ca +thejournal.ie +hebnews.cn +compuserve.com +youradchoices.ca +paylocity.com +bbci.co.uk +albany.edu +bain.com +gobizkorea.com +computer.org +cedexis.com +feedbooks.com +hs-banner.com +fullstory.com +belgium.be +logi.com +hostinger.com +maribacaberita.com +medpagetoday.com +bankofengland.co.uk +springeropen.com +deccanherald.com +rp-online.de +brainpickings.org +fetlife.com +ryerson.ca +rockpapershotgun.com +hs-analytics.net +kalerkantho.com +mixdrop.co +rebrand.ly +jiji.com +pangle.io +virgin.com +tiny.cloud +iowa.gov +cebnet.com.cn +ufrj.br +dashlane.com +poets.org +rbc.com +podio.com +sci-hub.se +uni-freiburg.de +mattel.com +razorpay.com +haraj.com.sa +wacom.com +jetblue.com +underarmour.com +brandeis.edu +break.com +tu-dresden.de +copyblogger.com +filmibeat.com +airasia.com +aftodioikisi.gr +sil.org +bankcomm.com +hrsa.gov +luc.edu +healthaffairs.org +fontsquirrel.com +backlinko.com +articulate.com +viifax.com +cleantechnica.com +neea.edu.cn +auswaertiges-amt.de +ntv.com.tr +csis.org +bitrix24.ru +bethesda.net +olark.com +secureservercdn.net +schneider-electric.com +lululemon.com +fortinet.com +bytedance.com +genial.ly +thomann.de +blick.ch +justpremium.com +slides.com +keio.ac.jp +immobilienscout24.de +giantbomb.com +cinarra.com +conta.cc +cybersource.com +23andme.com +mediamarkt.de +echoroukonline.com +allthingsd.com +unibe.ch +mundosexanuncio.com +hostelworld.com +nice.org.uk +mbc.net +honey.io +haoyer.com +offerimage.com +geek.com +wjx.cn +online.fr +telerik.com +freedesktop.org +qut.edu.au +sc.gov +sejda.com +networking.apple +ivoox.com +calm.com +tenki.jp +xuexila.com +cyberchimps.com +readwrite.com +telegraaf.nl +gouvernement.fr +techpowerup.com +online-metrix.net +coocan.jp +emc.com +9news.com +sussex.ac.uk +nat.gov.tw +whu.edu.cn +citibank.com +feishu.cn +dailynews.com +du.edu +tyc.edu.tw +gva.es +hltv.org +cardekho.com +opentracker.xyz +aepd.es +byteoversea.net +thepetitionsite.com +thedonald.win +faceit.com +bseindia.com +iupui.edu +anl.gov +thechive.com +bt.cn +levi.com +pixieset.com +iom.int +highsnobiety.com +adac.de +food52.com +thoughtcatalog.com +hqporner.com +utdallas.edu +freesound.org +editorialmanager.com +latercera.com +adswizz.com +freeprivacypolicy.com +dislanelibrar.top +classdojo.com +ernet.in +shorturl.at +lanzous.com +umengcloud.com +thefederalist.com +woothemes.com +bouncex.net +publicradio.org +coupons.com +securecafe.com +webatam.com +depositfiles.com +arabi21.com +webrootcloudav.com +list.ly +reson8.com +dilbert.com +irfanview.com +jma.go.jp +yicai.com +wm.edu +dreamwidth.org +quicksprout.com +direct.gov.uk +tiaomu.com +lovetoknow.com +aacrjournals.org +yimg.jp +singular.net +adx1.com +shopifysvc.com +acfun.cn +oxfam.org +dagbladet.no +malwarebytes.org +health.gov +tuko.co.ke +snu.ac.kr +wroc.pl +filmkovasi.org +178.com +newsbreak.com +telstra.com.au +cjr.org +jwpsrv.com +ferrari.com +exe.io +usyd.edu.au +bose.com +kathimerini.gr +act.org +kartra.com +mozit.cloud +passportindia.gov.in +bloglines.com +nagaswap.org +niche.com +qidian.com +bell.ca +atom.io +glaz.tv +charitynavigator.org +netbk.co.jp +join.chat +nsc.org +caringbridge.org +21cn.com +christianpost.com +sbixby.com +kapitalbank.az +celine.com +redis.io +audiomack.com +leagueofgraphs.com +powerschool.com +harley-davidson.com +aawsat.com +ksosoft.com +mongabay.com +jahannews.com +adl.org +bloomingdales.com +google.ba 
+tdscpc.gov.in +feedingamerica.org +hotmo.org +stockcharts.com +thelocal.se +livedoor.biz +unece.org +hellomagazine.com +sahamyab.com +netbeans.org +nordstromrack.com +ripe.net +interactivebrokers.com +720yun.com +ismedia.jp +rthk.hk +alaskaair.com +mq.edu.au +buzzsumo.com +stripes.com +onlinemektep.org +treasuredata.com +guggenheim.org +clien.net +camfrog.com +fubo.tv +index.hr +beforeitsnews.com +pjmedia.com +dw-world.de +daimler.com +zj.gov.cn +mymovies.it +pusher.com +ipage.com +all-free-download.com +toshiba.com +lanl.gov +nbcwashington.com +camsloveaholics.com +alternativeto.net +mobilesystemservice.com +stonybrook.edu +irishexaminer.com +unlv.edu +bath.ac.uk +transbank.cl +sendo.vn +le360.ma +ef.com +powells.com +xiazaiba.com +spotim.market +parcelsapp.com +otomoto.pl +travelocity.com +de.vu +centurylink.com +bayt.com +gumtree.com +wework.com +indymedia.org +thehackernews.com +manutd.com +obozrevatel.com +rencaijob.com +theinquirer.net +expressen.se +ust.hk +chocolateplatform.com +yahoosmallbusiness.com +alfabank.ru +bizographics.com +cmegroup.com +hulu.jp +wykop.pl +downdetector.com +datatables.net +care.com +innity.net +bjx.com.cn +dazn.com +tvnz.co.nz +gayboystube.com +nearpod.com +modesens.com +cheqzone.com +web.dev +newschool.edu +hi5.com +roll20.net +livetv.sx +ustr.gov +sinoptik.ua +nbcchicago.com +imoim.app +khaberni.com +crn.com +msstate.edu +freeservers.com +mediavine.com +virtual.edu.az +questionpro.com +dokuwiki.org +simplyhired.com +scotland.gov.uk +thenews.com.pk +termsfeed.com +unr.edu +beniculturali.it +inews.co.uk +prensa-latina.cu +akurat.co +swr.de +uni-saarland.de +ibps.in +c2.com +onlinekhabar.com +lucidchart.com +canoe.ca +yalla-shoot.com +nginx.net +transandfiestas.ga +rogerebert.com +brightbrides.net +llink.site +zb.com +sonyentertainmentnetwork.com +sonymobile.com +concursolutions.com +edf.fr +springserve.com +hurriyetdailynews.com +qhimg.com +mxpnl.com +apxlv.com +thanhnien.vn +hankyung.com +uri.edu +packtpub.com +aarth.net +duomai.com +ok.gov +bancosantander.es +lublin.pl +mediamatters.org +basketball-reference.com +tigris.org +cutt.us +inep.gov.br +legalzoom.com +ralphlauren.com +up.pt +unenvironment.org +tanea.gr +site.com +1dmp.io +streetinsider.com +santander.com.ar +upworthy.com +courthousenews.com +sportbox.ru +surveymonkey.co.uk +autismspeaks.org +q4cdn.com +ssg.com +neowin.net +ukri.org +navercorp.com +psmag.com +nhc.gov.cn +au.com +ad.nl +linuxmint.com +synxis.com +google.com.sv +swarthmore.edu +rp5.ru +linuxfoundation.org +stores.jp +urban.org +filmmodu.org +nau.edu +redditmedia.com +fpdf.org +yandex.kz +37signals.com +asiaone.com +us.es +photo-ac.com +alpha.gr +talent.com +sawbrokers.com +webstarts.com +nikkansports.com +grademiners.com +whatismyip.com +webgains.com +texastribune.org +aade.gr +wikimediafoundation.org +walla.co.il +epi.org +poloniex.com +demandbase.com +chipotle.com +montana.edu +5w52.com +rubyonrails.org +accesspressthemes.com +ottawacitizen.com +wildapricot.org +questia.com +omaha.com +dnspod.cn +ewaybillgst.gov.in +qunar.com +animevnn.com +stoloto.ru +freepdfconvert.com +transportation.gov +garanteprivacy.it +iqilu.com +orbitz.com +frontapp.com +skyscanner.net +ntnu.no +barchart.com +uned.es +uidaho.edu +serverbid.com +ac-illust.com +jusbrasil.com.br +vsco.co +newsru.com +intechopen.com +netpnb.com +all4webs.com +videohub.tv +pewforum.org +google.com.mm +123milhas.com +ice.gov +newsok.com +onedio.com +coremail.cn +southernliving.com +cryptotabbrowser.com +dlink.com +moneysavingexpert.com +intercom.com +mindspring.com +eol.cn 
+carriersignal.info +abc7chicago.com +wordfence.com +ctv.ca +ksu.edu.sa +cultofmac.com +iau.ac.ir +ada.gov +jqueryui.com +amung.us +wonderplugin.com +duosecurity.com +blubrry.com +wmtransfer.com +fresherslive.com +slashfilm.com +uchile.cl +libgen.rs +ckeditor.com +ksord.com +xvideos-cdn.com +squarespace-cdn.com +mdr.de +ttvnw.net +aeon.co +timeslive.co.za +unfpa.org +cmail20.com +venngage.com +blm.gov +psychologicalscience.org +muckrack.com +adliran.ir +swarovski.com +gittigidiyor.com +sothebys.com +imagebam.com +indiapost.gov.in +laptopmag.com +hertz.com +merck.com +volkskrant.nl +tok2.com +raiplay.it +oaspapps.com +forbes.ru +sunat.gob.pe +abc7ny.com +scu.edu +abola.pt +open.edu +fextralife.com +bizcommunity.com +caesars.com +bittrex.com +ugr.es +journaldunet.com +eurobank.gr +mozdev.org +dropbox-dns.com +rtl.de +upf.edu +cloudways.com +jdsports.co.uk +shapeways.com +cognizant.com +tweakers.net +gazzetta.gr +educause.edu +clicky.com +uni-frankfurt.de +uni-stuttgart.de +scienceblogs.com +cpx.to +togetter.com +duden.de +3ds.com +globalsecurity.org +wiocha.pl +ecowatch.com +pinterest.ru +tcs.com +ilfattoquotidiano.it +umu.se +google.ee +cogocast.net +photo.net +rogers.com +youla.ru +typingclub.com +xanga.com +clubic.com +deutsche-bank.de +screencast-o-matic.com +8m.com +daumcdn.net +24s.com +radiotimes.com +bannersnack.com +mizbanfa.net +auth.gr +uts.edu.au +hjenglish.com +alaraby.co.uk +panorama.com.al +sportradar.com +lodz.pl +trueleadid.com +lpages.co +google.com.np +kingston.com +sweetwater.com +dow.com +fontspace.com +chicagobusiness.com +dice.com +kerala.gov.in +creative.com +elfinanciero.com.mx +pcauto.com.cn +maybank2u.com.my +cafef.vn +themify.me +sc-gw.com +jhsph.edu +wordpressfoundation.org +tianyancha.com +oscars.org +deref-gmx.net +toronto.ca +suite101.com +psychiatryonline.org +nii.ac.jp +coca-colacompany.com +uhc.com +st.com +betfair.com +questrade.com +jmty.jp +blogfreely.net +southampton.ac.uk +getfvid.com +bis.org +oricon.co.jp +spectrum.com +createspace.com +spoti.fi +thinglink.com +mtlnovel.com +hypermart.net +commerzbank.de +wetteronline.de +ascd.org +rollcall.com +ae.com +adb.org +bottegaveneta.com +paychex.com +idealmedia.io +bd-pratidin.com +campaignlive.co.uk +freefiremobile.com +91mobiles.com +pgatour.com +yunaq.com +filgoal.com +allhugefeed.com +curtin.edu.au +camp-fire.jp +arkansas.gov +mid.ru +sail-horizon.com +nrel.gov +telugustop.com +deutschlandfunk.de +outlookindia.com +swiftkey.com +sitejabber.com +google.com.cu +ciscospark.com +jalan.net +bravotv.com +haomaner.com +bamboohr.com +servenobid.com +sportsurge.net +blogimg.jp +tv-asahi.co.jp +webofknowledge.com +vzwwo.com +theodysseyonline.com +vaticannews.va +lockheedmartin.com +optimizesrv.com +huamu.cn +114la.com +nju.edu.cn +quovadisglobal.com +rafflecopter.com +anadolu.edu.tr +hbx.com +exlibrisgroup.com +theroot.com +govtech.com +netmng.com +thedenverchannel.com +fau.edu +propellerads.com +embedly.com +ssisurveys.com +boots.com +richaudience.com +xx3.kz +singaporeair.com +infogr.am +shueisha.co.jp +pydata.org +google.com.gt +tvp.pl +dubizzle.com +mastercard.us +education.com +homeaffairs.gov.au +ic3.gov +iucnredlist.org +tsite.jp +clmbtech.com +yemek.com +transportr.io +noodlemagazine.com +freedownloadmanager.org +google.co.cr +elsalvador.com +thepointsguy.com +consumerist.com +chalmers.se +eetimes.com +programme-tv.net +oneplus.com +zenwriting.net +aaxads.com +caisse-epargne.fr +snapdeal.com +nic.ir +baishan-cloud.net +gamesindustry.biz +aalto.fi +therecipecritic.com +ntv.co.jp +valuecommerce.ne.jp 
+1push.io +efukt.com +indiebound.org +slidesgo.com +tendawifi.com +asme.org +who.is +rxlist.com +homedepot.ca +snap.com +cq.gov.cn +ynetnews.com +51y5.net +aad.org +510hr.com +unistra.fr +liverpool.com.mx +lpsnmedia.net +abcnews.com +rabbitpre.com +die.net +dion.ne.jp +freedomhouse.org +draxe.com +usg.edu +wz.cz +1mg.com +calpoly.edu +baidustatic.com +thisismoney.co.uk +appfolio.com +vc.ru +freshchat.com +sipo.gov.cn +rzd.ru +later.com +en25.com +barclays.co.uk +michelin.com +cirquedusoleil.com +astrologyanswers.com +brightside.me +ntlworld.com +feebee.com.tw +hiido.com +sporcle.com +k-state.edu +pokemon.com +amzn.com +golem.de +igamecj.com +cafelog.com +tweaktown.com +microcenter.com +leroymerlin.ru +business.gov.au +thenorthface.com +gib.gov.tr +fbdown.net +pair.com +edgecastdns.net +wsbtv.com +kddi.com +arm.com +comixology.com +xmtrading.com +zebra.com +sitesell.com +gigazine.net +userbenchmark.com +bradesco.com.br +alwakeelnews.com +uni-muenster.de +kubernetes.io +dailypost.ng +leafly.com +commerce.gov +magicbricks.com +keepa.com +ee.co.uk +goodmorningamerica.com +doc88.com +novayagazeta.ru +bloomsbury.com +donaldjtrump.com +visme.co +eaton.com +uni-tuebingen.de +reading.ac.uk +hi-ho.ne.jp +linternaute.com +helpscout.net +aleassbun.site +ebay.es +uni-leipzig.de +pajak.go.id +chez.com +milanuncios.com +torgi.gov.ru +louisville.edu +google.com.bo +connectad.io +eiu.com +hket.com +avma.org +picmonkey.com +truthout.org +heraldscotland.com +kinogo.biz +yousendit.com +freeserve.co.uk +sympatico.ca +cnsnews.com +mozilla-europe.org +paulgraham.com +sport-express.ru +ubnt.com +fark.com +gamedog.cn +postandcourier.com +eatright.org +aolcdn.com +elespectador.com +birmingham.ac.uk +fodors.com +mail-archive.com +ency-education.com +abc13.com +53.com +zionschool.info +livehindustan.com +nchsoftware.com +r7.com +upv.es +activecampaign.com +printful.com +instagr.am +duba.net +nsa.gov +japannetbank.co.jp +cert.org +prf.hn +as.me +lookbook.nu +yonhapnews.co.kr +pcgamesn.com +tmcnet.com +gouv.qc.ca +wccftech.com +livemaster.ru +idealista.com +youngjoygame.com +wpscdn.cn +plin.im +winscp.net +lgtvsdp.com +m-w.com +buenotraffic.com +simplilearn.com +shinezone.com +sandiegozoo.org +marketwire.com +reurl.cc +philstar.com +fmovies.to +netlify.app +uea.ac.uk +darkreading.com +gaiaonline.com +uniregistry.com +mcclatchydc.com +vindicosuite.com +flexjobs.com +zap2it.com +sekindo.com +fastpic.ru +corsair.com +bloomberglaw.com +univ-lorraine.fr +taiwannews.com.tw +zhiding.cn +ratemyprofessors.com +citysearch.com +yubico.com +aipai.com +buyersdrive.com +appen.com +sheypoor.com +kapwing.com +chronoengine.com +mca.gov.cn +artofmanliness.com +siamswim.com +interpol.int +cdnbye.com +cardiff.ac.uk +bfi.org.uk +qwant.com +trimble.com +kitapsec.com +costco.ca +uni-mainz.de +cdngslb.com +grainger.com +convertkit.com +finn.no +a9vg.com +finanzen.net +thisamericanlife.org +bab.la +ulule.com +shazam.com +adaa.org +he.net +netafraz.com +eloqua.com +policybazaar.com +gust.com +realme.com +saisoncard.co.jp +monotaro.com +freerepublic.com +lenovomm.com +onlineradiobox.com +meteofrance.com +paper.li +webmoney.ru +cbo.gov +nrc.nl +unicredit.it +lsosad.com +uv.es +arthritis.org +vklass.se +alarab.com +hiend.xyz +avid.com +dailypakistan.com.pk +sinica.edu.tw +llnl.gov +actblue.com +paisabazaar.com +spin.com +ipetitions.com +mktoresp.com +ku6.com +healio.com +armorgames.com +lismcanalys.fun +zacks.com +canalplus.com +justjared.com +olx.co.id +vz.ru +bidtheatre.com +amebaownd.com +filedropper.com +associates-amazon.com +classlink.com 
+tribuneindia.com +harborfreight.com +helpstart.co.kr +gfan.com +slackb.com +groupme.com +yotpo.com +dtic.mil +nieuwsblad.be +natlawreview.com +jobinja.ir +huffingtonpost.fr +scirp.org +inss.gov.br +googlegroups.com +pngwing.com +trendhunter.com +adda247.com +mafengwo.cn +bancobrasil.com.br +google.com.py +currys.co.uk +ruc.edu.cn +bea.gov +smadex.com +securelist.com +koreastardaily.com +uw.edu.pl +skyscrapercity.com +jssor.com +soumu.go.jp +sme.sk +affiliatelabz.com +nst.com.my +picturepush.com +chocolatey.org +bangordailynews.com +interarbiter.info +okko.tv +quikr.com +quickconnect.to +willhaben.at +su.se +deutschepost.de +joemonster.org +xiti.com +eatingwell.com +mdhv.io +clarivate.com +deakin.edu.au +loopnet.com +gsjyzg.com +ni.com +napster.com +medtronic.com +ruhr-uni-bochum.de +data.gov.uk +lehigh.edu +ganji.com +streamtape.com +forward.com +collegedunia.com +66law.cn +dolcegabbana.com +datsgirl.com +befunky.com +gostaresh.news +wsws.org +cadenaser.com +liquipedia.net +megogo.net +rootsweb.com +mail-order-bride.net +apple-dns.cn +joomshaper.com +sail-personalize.com +cainiao.com +kuwo.cn +shinystat.com +bookmyshow.com +hunan.gov.cn +internet-start.net +load20.biz +giga.de +aaas.org +cabanova.com +perfectmarket.com +mises.org +ath.cx +mext.go.jp +sumitomokenki.com.cn +dnacdn.net +numbeo.com +elibrary.ru +spokesman.com +rnz.co.nz +mailtrack.io +luckyforbet.com +javmodels.info +readwriteweb.com +unaids.org +mediabistro.com +skynewsarabia.com +4.cn +thepiratebay10.org +cedexis.net +nsfc.gov.cn +datingranking.net +rev.com +provsd.info +uzorak.info +99yechang.com +sitemeter.com +idianfa.com +hamburg.de +zmags.com +datenschutz-generator.de +moderus.info +yummly.com +fonts.net +cityam.com +mako.co.il +cathaypacific.com +resetera.com +maisonmargiela.com +knoxnews.com +abv.bg +qantas.com +retargetly.com +snip.ly +betgorebysson.club +lesanimaux.site +fb.ru +eyny.com +ic.ac.uk +gentoo.org +sentinelone.net +apsense.com +theoatmeal.com +niu.edu +redirectingat.com +faucetpay.io +sverigesradio.se +penguin.co.uk +v0cdn.net +quzke.com +pingan.com +dtu.dk +fujifilm.com +yandex.by +isabellacharms.xyz +meethue.com +larousse.fr +cleanpng.com +calgaryherald.com +fotostrana.ru +oboporn.com +scamadviser.com +mind.org.uk +desktopnexus.com +exirbroker.com +findlaw.cn +byted.org +aminoapps.com +optimum.net +oboteen.com +theiet.org +price.com.hk +iq-servers.com +osd.mil +ihsmarkit.com +sbi.co.in +ldoceonline.com +google.cm +universetoday.com +amarujala.com +tsn.ca +vietcombank.com.vn +game8.jp +phncdn.com +mps.gov.cn +gleb.website +hefteaz.info +cxense.com +saramin.co.kr +mpaa.org +reg.ru +w3layouts.com +united-domains.de +my.gov.au +ecfr.gov +newswise.com +lexcat.info +hkprice.info +hw.ac.uk +ubuntuforums.org +jotformeu.com +ebsco.com +matamata.com +avacharms.xyz +laprensagrafica.com +joomag.com +azertag.az +ccn.com +588ku.com +gq-magazine.co.uk +audiojungle.net +cmbc.com.cn +cairn.info +adtng.com +brainpop.com +iinet.net.au +analog.com +stackpathcdn.com +beckershospitalreview.com +easyasvpn.com +linguee.fr +financialcontent.com +finalfantasyxiv.com +wakelet.com +samanage.com +babbel.com +cafebazaar.ir +gammaplatform.com +mixi.jp +lowyat.net +diynetwork.com +autotrader.co.uk +liberty.edu +petco.com +topshop.com +feedspot.com +infusionsoft.app +elbaestes.pro +edocr.com +ctan.org +deref-web.de +iadb.org +u.to +osce.org +hubblesite.org +s-msn.com +oilprice.com +uni-goettingen.de +tapjoyads.com +ceneo.pl +viddler.com +csfd.cz +nbcphiladelphia.com +koolearn.com +liepin.com +dersyndikalist.info +anz.com.au 
+logmeininc.com +verywellfamily.com +ysl.com +uaf.edu +50megs.com +itsmyurls.com +blogspot.be +mg.co.za +movavi.com +galinka.info +google.com.bh +neurology.org +eso.org +writeablog.net +bangbros.com +inbox.lv +heraldtribune.com +arcor.de +dailythanthi.com +thairath.co.th +donanimhaber.com +gtbank.com +mapbar.com +wmich.edu +newspapers.com +builtwith.com +enterprise.com +cam4.com +tenpay.com +beginnersmind.info +upc.edu +fsc.org +binghamton.edu +opera.software +hoyendieta.info +cuisineandhealth.site +cafemom.com +energysector.website +basalam.com +hollywoodlife.com +mcall.com +snazzymaps.com +jiangsu.gov.cn +umkc.edu +chinanews.com.cn +english-heritage.org.uk +unipd.it +carmax.com +navdmp.com +toast.com +unimi.it +bsigroup.com +uta.edu +nbcdfw.com +google.cat +tcpdf.org +interestingengineering.com +freepeople.com +tvn24.pl +opensource.com +hpplay.cn +cyberlink.com +sennheiser.com +habitat.org +calculatorsoup.com +mydealz.de +duplichecker.com +marianacastromoreira.com +homeip.net +custom-roms.com +alabama.gov +imagevenue.com +google.lu +europe1.fr +mrpdata.net +nikkei.co.jp +youcanbook.me +lib.ru +slidesharecdn.com +amc.com +kingtime.jp +dwcdn.net +ruby-lang.org +mk.co.kr +globes.co.il +medallia.com +mythemeshop.com +soy502.com +biccamera.com +spaggiari.eu +google.com.lb +tucson.com +bochk.com +yangkeduo.com +aicpa.org +intoday.in +thedailyfunkclub.com +unicamp.br +dn.se +xvideos5.com +taxfoundation.org +yaml.org +xcar.com.cn +aiv-delivery.net +qhnmdb.com +clipwatching.com +thinkphp.cn +gatesnotes.com +gssprt.jp +mql5.com +techadvisor.co.uk +cachefly.net +abdn.ac.uk +afisha.ru +centos-webpanel.com +takepart.com +larepublica.pe +buy123.com.tw +123formbuilder.com +bewusstsein-events.info +aeaweb.org +mxtoolbox.com +powtoon.com +futura-sciences.com +defensenews.com +formsite.com +richmond.com +53kf.com +copyscape.com +ut.ac.ir +samsungelectronics.com +salary.com +hackerone.com +cityheaven.net +healthdata.org +government.ru +conviva.com +swiggy.com +almanac.com +hyundai.com +kickasstorrents.to +khn.org +banco.bradesco +indigo.ca +gongchang.com +liuxue86.com +impactradius-event.com +hani.co.kr +madrid.org +imgsite.net +cntd.ru +openclassrooms.com +elmogaz.com +cmail19.com +newyorkfed.org +travelchannel.com +bestbrides.org +pin.it +steamcontent.com +wakwak.com +brandwatch.com +adpopblocker.com +darpa.mil +lilly.com +mediav.com +vim.org +fotki.com +postmates.com +xdrig.com +vwo.com +actionnetwork.org +xozilla.com +dede58.com +btcbay.net +webroot.com +bihar.gov.in +elecfans.com +dazeddigital.com +aftenposten.no +sandai.net +google.com.kh +aucfan.com +11alive.com +txdot.gov +huxiu.com +zscaler.com +bungie.net +bkrtx.com +geoedge.be +perezhilton.com +dailysabah.com +royanews.tv +acx.com +google.com.et +louisiana.gov +ixl.com +ceskatelevize.cz +gamerant.com +cqhot.cn +interpark.com +flock.com +mouser.com +e-msedge.net +playboy.com +oberlo.com +pluto.tv +uni-hannover.de +gda.pl +yemeksepeti.com +catb.org +arga-mag.com +mprnews.org +idqqimg.com +1und1.de +ccdi.gov.cn +tripadvisor.ca +pro-football-reference.com +ncaa.org +bham.ac.uk +gocomics.com +tube8.com +zhongguowangshi.com +nwf.org +cookieconsent.com +tealiumiq.com +rstyle.me +martinfowler.com +familyhandyman.com +macworld.co.uk +grifo210.com +ufsc.br +tovima.gr +chartboost.com +pianmenw.com +gamewith.jp +tldp.org +easports.com +postheaven.net +orange.com +neu.edu +bpi.ir +myrecipes.com +librarything.com +wordwall.net +ableton.com +smi2.ru +amp.dev +ninemsn.com.au +jamfcloud.com +good.is +qy.net +redstate.com +yifysubtitles.org 
+samsungpositioning.com +clustrmaps.com +themehorse.com +csc.gov.in +drift.com +polyvore.com +unu.edu +rbth.com +sodapdf.com +thecrimson.com +prospect.org +thebestgame2020.com +delfi.lt +uniroma1.it +zanox.com +bootcss.com +brit.co +rose-brides.com +moppy.jp +fastweb.com +vox-cdn.com +paycomonline.net +empowher.com +ru.nl +usenix.org +ixbt.com +choosemyplate.gov +freepatentsonline.com +chuansong.me +ucas.com +wallpapercave.com +fgov.be +youracclaim.com +thedailystar.net +rtmark.net +microsofttranslator.com +bajajfinserv.in +businessdictionary.com +afi-b.com +people-press.org +byteimg.com +swagtraffcom.com +collective-buyer.com +chinaacc.com +freshmeat.net +winehq.org +mycima.video +tes.com +geoadnxs.com +kent.ac.uk +zenodo.org +acog.org +almasryalyoum.com +vitkac.com +ams.org +f6s.com +distcache.org +az.gov +yximgs.com +delfi.lv +steam-chat.com +foresee.com +wnycstudios.org +metal-archives.com +unbouncepages.com +ionicframework.com +anvato.net +etymonline.com +uow.edu.au +shangri-la.com +nirsoft.net +nextdirect.com +ck-ie.com +physorg.com +coca-cola.com +tinymce.com +ouo.press +tripadvisor.fr +peopledaily.com.cn +hetzner.com +csrc.gov.cn +mec.gov.br +datingmentor.org +leadpages.net +1001fonts.com +keywordtool.io +myswitzerland.com +clickz.com +jugantor.com +topgear.com +avocet.io +material.io +universityofcalifornia.edu +asics.com +mun.ca +baseball-reference.com +rotoworld.com +grabtaxi.com +bnnbloomberg.ca +tribpub.com +forter.com +gifshow.com +spotx.tv +opensocietyfoundations.org +bancsabadell.com +hola.com +life.ru +backlog.jp +tiffany.com +yuanzhanapp.com +uncc.edu +panasonic.jp +e-derslik.edu.az +brides.com +cootlogix.com +keepass.info +thenewslens.com +zaobao.com +marketsandmarkets.com +bcove.me +sky.it +ladepeche.fr +perfil.com +afterpay.com +rungrinh.vn +icij.org +jreast.co.jp +vingle.net +ucd.ie +teamusa.org +sncf.com +connatix.com +whowhatwear.com +voc.com.cn +businessoffashion.com +blackrock.com +google.co.ke +calibre-ebook.com +fisglobal.com +2o7.net +vokrug.tv +okp.com +flavors.me +plista.com +banglanews24.com +tradeindia.com +heapanalytics.com +freemusicarchive.org +jang.com.pk +pngegg.com +colossusssp.com +adhaven.com +ipsnews.net +qnap.com +moddb.com +insurancejournal.com +wfu.edu +247wallst.com +arabseed.cam +comingsoon.net +g2afse.com +jkanime.net +runescape.com +earthsky.org +wp-royal.com +sherwin-williams.com +wordcounter.net +hainan.gov.cn +todoist.com +paxful.com +score.org +selfgrowth.com +marketingprofs.com +umb.edu +ui.ac.id +unglobalcompact.org +newadvent.org +iit.edu +ninthdecimal.com +gitbook.io +wjla.com +ohio.edu +khou.com +crowdrise.com +anthem.com +cheshi.com +alipaydns.com +saglik.gov.tr +theadvocate.com +middlebury.edu +zoznam.sk +rawgit.com +vip.com +niemanlab.org +notebookcheck.net +ravelry.com +rcn.com +copart.com +google.com.ni +no-ip.com +sante.fr +activestate.com +ozerov.de +processon.com +u17.com +bluejeans.com +thenationalnews.com +enorth.com.cn +ncaa.com +citigroup.com +win-rar.com +toptenreviews.com +savethechildren.org +teamspeak.com +sammobile.com +muni.cz +marketwired.com +hotukdeals.com +womansday.com +adultswim.com +buff.ly +denofgeek.com +wimp.com +confex.com +und.edu +pcpop.com +thenewstribune.com +wiwo.de +home.cern +nongnu.org +youneedabudget.com +dayoo.com +myheritage.com +hsn.com +kxcdn.com +mhtwyat.com +ufrgs.br +duapps.com +bastillepost.com +toysrus.com +goop.com +headlines.pw +google.mu +17173.com +wenthemes.com +airtm.com +opm.gov +sentry-cdn.com +moneycrashers.com +gapyear.com +llbean.com +google.com.om +property24.com 
+imagetwist.com +tv.com +kingcounty.gov +bunshun.jp +pravo.gov.ru +cnhubei.com +sony.jp +carsales.com.au +colourpop.com +uwyo.edu +msdmanuals.com +kentucky.com +michelejullian.info +digitalguardian.com +efe.com +pozdravok.ru +ubergizmo.com +sasac.gov.cn +theporndude.com +majhinaukri.in +moodys.com +fitgirl-repacks.site +cnfol.com +cloudflare-dns.com +wpi.edu +royalsociety.org +computerhistory.org +hdslb.com +buymeacoffee.com +sachsen.de +samsungiotcloud.com +coschedule.com +pinterest.it +narrative.io +skladchik.com +www.ne.jp +paperwritings.com +lifehacker.ru +kantei.go.jp +bitfinex.com +shopee.com +realmadrid.com +marketscreener.com +myschoolapp.com +racked.com +dunyanews.tv +angelbroking.com +jcb.co.jp +sho.com +hktdc.com +dochub.com +threatpost.com +rtl.fr +6abc.com +meredith.com +divx.com +sbis.ru +openbsd.org +london.gov.uk +worldofwarships.eu +stjude.org +ge.tt +breakflip.com +thestate.com +palgrave.com +opensubtitles.org +bulbagarden.net +artic.edu +vector.co.jp +coach.com +gadgetsnow.com +aao.org +guim.co.uk +tn.edu.tw +theinformation.com +caixin.com +residentadvisor.net +droom.in +womenshealth.gov +bravenet.com +spbu.ru +duga.jp +50webs.com +carousell.sg +bitflyer.com +geogebra.org +griffith.edu.au +destatis.de +pravda.com.ua +regjeringen.no +internet.com +createsend1.com +uct.ac.za +ulifestyle.com.hk +312168.com +continental.com +gohugo.io +statuspage.io +ifvod.tv +16mb.com +biteable.com +buzznet.com +dwz.cn +harpers.org +zalando.de +npc.gov.cn +airbnb.ca +lesoir.be +sohatv.vn +manualslib.com +rada.gov.ua +umaryland.edu +uscg.mil +6789.com +reallifecam.com +news247.gr +qafqazinfo.az +himado.in +lboro.ac.uk +businesstoday.in +citibank.co.in +radissonblu.com +creditmutuel.fr +cookiebot.com +vfsglobal.com +amemv.com +onepeloton.com +nationwide.com +thisoldhouse.com +pornpics.com +khabarpu.com +mobypicture.com +hathitrust.org +traveloka.com +nestle.com +crypto.com +google.ge +vevo.com +refersion.com +wgntv.com +btcliving.com +seznamzpravy.cz +icptrack.com +w3techs.com +jneurosci.org +tulsaworld.com +vpnmentor.com +datacamp.com +rijksmuseum.nl +logitechg.com +digitalcommerce360.com +cambridgeenglish.org +codedexchange.com +americanthinker.com +ofcom.org.uk +buddypress.org +rotary.org +nova.edu +espncdn.com +pe.com +safer-networking.org +mbalib.com +aaos.org +biblestudytools.com +realtime-bid.com +fontanka.ru +powerofpositivity.com +lolipop.jp +ethnos.gr +justanswer.com +x2convert.com +kunlunsl.com +couponfollow.com +jagodangdut.com +googlehosted.com +jobkorea.co.kr +brainly.com.br +jianzhi8.com +mailjet.com +myperfect2give.com +abs.gov.au +asahi-net.or.jp +princetonreview.com +afthemes.com +everestads.net +okdiario.com +ipko.pl +pons.com +webgarden.cz +110mb.com +domaining.com +dahe.cn +offcn.com +secnews.gr +sciam.com +chinamobile.com +tizianafausti.com +boltdns.net +beliefnet.com +nbcsandiego.com +ucoz.net +flannels.com +dmcdn.net +listverse.com +jdsupra.com +airtelxstream.in +dccomics.com +livechat.com +env.go.jp +cnn.io +consequenceofsound.net +simplyrecipes.com +cjn.cn +hessen.de +jutarnji.hr +amcdn.vn +gitbook.com +carleton.edu +latribune.fr +lexilogos.com +telegram.com +riafan.ru +bunnings.com.au +f95zone.to +uptimerobot.com +fco.gov.uk +semana.com +xzjdjx.com +uqam.ca +pc6.com +online-audio-converter.com +psychiatry.org +udmserve.net +diandongwajueji.com +ulb.ac.be +a.co +dtscdn.com +linternaute.fr +openid.net +wpu.sh +yt1s.com +bgsu.edu +google.com.pr +publix.com +wikidata.org +distrokid.com +rpp.pe +bobvila.com +thesmokinggun.com +ansi.org +delfi.ee +postaffiliatepro.com 
+apc.com +conferdeploy.net +vh1.com +exxonmobil.com +biggerpockets.com +yogajournal.com +regions.com +atmarkit.co.jp +click2houston.com +unwto.org +pornone.com +blacklivesmatter.com +filmweb.pl +abercrombie.com +pcisecuritystandards.org +muji.com +franceinter.fr +hdsex.org +ticksy.com +hasil.gov.my +lfstmedia.com +weblate.org +deccanchronicle.com +majestic.com +thinkupthemes.com +pglstatp-toutiao.com +clickondetroit.com +120ask.com +amara.org +presseportal.de +gizmodo.com.au +ellitoral.com +capital.gr +unipi.it +yes24.com +zergnet.com +nex8.net +ua.es +contineljs.com +yourtango.com +2mnd56.com +privacypolicygenerator.info +jmu.edu +marktplaats.nl +bytedance.net +musescore.com +zoosk.com +haqqin.az +clan.su +fcbarcelona.com +mtime.com +businesscatalyst.com +ipt.pw +soccerway.com +ondemand.com +up.nic.in +verizon.net +dynatrace.com +egov.kz +fanduel.com +santander.com.br +storenvy.com +rbi.org.in +terkini.id +wuxiaworld.com +discord.media +wfaa.com +yam.com +pp.ru +gov.kr +yourdailysportfix.com +bama.ir +google.hn +diepresse.com +fullerton.edu +brightlocal.com +omnicalculator.com +easygetinsta.com +alanba.com.kw +googleapis.cn +jaxa.jp +wefinex.net +abc15.com +taipeitimes.com +lycos.fr +cineca.it +miaminewtimes.com +ipsosinteractive.com +id.me +google.mn +edelman.com +uni-bremen.de +osaka-u.ac.jp +investorplace.com +aihelp.net +a3cloud.net +gmpg.org +google.cd +whereby.com +loreal.com +okala.com +21food.cn +kdvr.com +samsungrs.com +wwnorton.com +instamojo.com +dropboxstatic.com +shiftdelete.net +walkerland.com.tw +ns1p.net +flvto.biz +bancogalicia.com.ar +saat24.news +dlr.de +next.co.uk +basspro.com +cuntempire.com +zcubes.com +azureedge.us +sportradarserving.com +betway.com +ca.com +google.ci +qhdsny.com +ms.gov +ilna.news +internationalwomensday.com +joelonsoftware.com +jacksonville.com +iab.com +adultfriendfinder.com +newswire.com +ssjzw.com +metlife.com +todo1.com +wondershare.net +b92.net +c-msedge.net +transifex.com +escort-advisor.com +openvpn.net +dns-shop.ru +solarwinds.com +twitcasting.tv +escholarship.org +winnipegfreepress.com +cnnturk.com +simon.com +watanserb.com +servimg.com +brew.sh +booklikes.com +ca168.com +themefreesia.com +smarttradecoin.com +x-mol.com +utarget.ru +iq.com +brisbanetimes.com.au +shaadi.com +kdocs.cn +unilever.com +nerdist.com +10086.cn +firefox.com.cn +wowslider.com +kampyle.com +it1352.com +jacobinmag.com +state.nj.us +memrise.com +iod2.cn +umaine.edu +qhyhgf.com +lwn.net +699pic.com +globalpost.com +getepic.com +mp3-youtube.download +samba.org +bjnews.com.cn +imo.org +dbs.com +mxhichina.com +mediapart.fr +socialbakers.com +mashreghnews.ir +sulekha.com +nebraska.gov +eldorado.ru +getfirefox.com +treas.gov +google.li +videohive.net +nexage.com +inosmi.ru +bget.ru +sainsburys.co.uk +upornia.com +scoop.co.nz +al3omk.com +zencdn.net +wetpaint.com +viifan.com +spectrumlocalnews.com +subhd.com +solidworks.com +csdiran.ir +arukikata.co.jp +1drv.com +adthrive.com +vonage.net +igg.me +joebiden.com +zynga.com +rospotrebnadzor.ru +weeklystandard.com +fzg360.com +simplisafe.com +babble.com +share-videos.se +rainn.org +veeam.com +gurufocus.com +motherless.com +blockchain.info +miamioh.edu +itar-tass.com +pec.it +ca800.com +screener.in +sd.gov +strongest.cn +nflbite.com +playvalorant.com +containerstore.com +hearthis.at +excelsior.com.mx +tagesanzeiger.ch +leidenuniv.nl +kicker.de +powerthesaurus.org +cudasvc.com +nimo.tv +valentino.com +woshipm.com +google.am +cyberciti.biz +mrtnsvr.com +thinkquest.org +oprahmag.com +webaim.org +ucdenver.edu +liverpool.ac.uk 
+macrotrends.net +vudu.com +pcloud.com +10fastfingers.com +animenewsnetwork.com +athome.co.jp +iis.net +nascar.com +carrd.co +user-shield.com +nea.org +royalgazette.com +bravehost.com +coding.net +redcrossblood.org +amnestyusa.org +christianlouboutin.com +washingtonian.com +hs.fi +geo.tv +center4family.com +batmanapollo.ru +statisticbrain.com +cervantes.es +amcs-tachyon.com +tubesafari.com +symbaloo.com +clutch.co +mta.info +jianzhiba.net +jocial.com +prv.pl +3dn.ru +playfabapi.com +rolex.com +amazon.sg +icao.int +allposters.com +siteimproveanalytics.io +bendibao.com +educity.cn +diva-portal.org +azurefd.us +dwell.com +whattoexpect.com +dlive.tv +opb.org +siteswithcontent.com +analdin.com +roche.com +famitsu.com +housing.com +soha.vn +maxmind.com +google.com.af +tongji.edu.cn +ipinfo.io +lds.org +google.is +atw.hu +inoreader.com +budsgunshop.com +sketch.com +goodreturns.in +porntrex.com +bmwi.de +cafis-paynet.jp +chowhound.com +banesco.com +akahost.net +agpd.es +sensortower.com +grc.com +loex.io +jdpower.com +empireonline.com +sdu.edu.cn +uni-trier.de +grandviewresearch.com +wealthsimple.com +thisiscolossal.com +netacad.com +thelocal.de +4anime.to +jyb.cn +prz.edu.pl +kbstar.com +mdanderson.org +yaklass.ru +cnpq.br +rus.ec +seoccc.com +pornhat.com +companieshouse.gov.uk +bumble.com +fonts.com +almaghreb24.com +format.com +austinchronicle.com +hikvision.com +ruvr.ru +douyinvod.com +euobserver.com +img-taboola.com +kmail-lists.com +thepresident.gr +telesurtv.net +blossomthemes.com +lrb.co.uk +pearsoncmg.com +strath.ac.uk +nwps.ws +postman.com +fourmilab.ch +commsec.com.au +jamaran.news +advocate.com +href.li +softwareadvice.com +petsmart.com +radiko.jp +kickassanime.rs +monex.co.jp +societegenerale.fr +index.hu +hzmklvdieo.com +laobiao.com +opengroup.org +cbsistatic.com +zum.com +blogactiv.eu +blondieshop.com +nationalgallery.org.uk +twitlonger.com +coinpot.co +clipconverter.cc +amazon.com.tr +clicrbs.com.br +yandex.ua +guru.com +worldsecuresystems.com +flywheelsites.com +nextcloud.com +skycn.com +juniper.net +lindaikejisblog.com +a1sewcraft.com +portableapps.com +walmart.com.mx +hypermusk.com +uisdc.com +typosthes.gr +tubitv.com +fxstreet.com +ebc.com.br +skift.com +daad.de +niniban.com +pipedrive.com +getpaint.net +slack-redir.net +jiji.ng +falabella.com +baixing.com +webteb.com +exness.com +todayonline.com +fabric.com +groupspaces.com +privacypolicyonline.com +franchemduty.work +gitv.tv +wallstreetforum.net +barackobama.com +stihi.ru +optinmonster.com +funimation.com +thimpress.com +google.com.mt +uni-due.de +saatchiart.com +geocities.co.jp +trendarbitrage.com +deployads.com +flipgrid.com +restream.io +gta5-mods.com +suse.com +superpages.com +guardian.ng +metafilter.com +local.com +sandisk.com +google.ht +authy.com +thenationonlineng.net +uab.cat +linestep.net +blueyonder.co.uk +yourbrideglobal.com +usfca.edu +vidyard.com +cdn77.org +qub.ac.uk +morgenpost.de +bih.nic.in +split.io +indiatvnews.com +blogspot.mx +city.ac.uk +theqoo.net +intercontinental.com +more.tv +thewire.in +b-msedge.net +angular.io +3dcartstores.com +pornsos.com +picsart.com +aph.gov.au +sbipg.sbi +nearme.com.cn +sae.org +lieferando.de +shopclues.com +algoritmika.az +democratandchronicle.com +samedayessay.com +bizport.cn +yinxiang.com +bancochile.cl +yankodesign.com +hapitas.jp +epo.org +nitroflare.com +freebeacon.com +webpagetest.org +livecareer.com +yatra.com +youme.im +tvbs.com.tw +pizzahut.com +blogspot.ch +iberia.com +iconosquare.com +ttlink.com +governmentjobs.com +elte.hu +mediacdn.vn +panerabread.com +asda.com 
+hotnewhiphop.com +moa.gov.cn +sse.com.cn +google.as +ebaydesc.com +henan.gov.cn +bravotube.net +58pic.com +jobcan.jp +liverpoolfc.com +nick.com +network-auth.com +usccb.org +unina.it +alisoft.com +ilgiornale.it +tn.gov.in +trekbikes.com +aamc.org +woolworths.com.au +adsco.re +krone.at +publico.es +bbva.com.ar +mojang.com +clegc-gckey.gc.ca +4movierulz.lv +ozbargain.com.au +300.cn +odin.com +chicago.gov +avature.net +bet.com +easywp.com +culture.ru +zoho.in +therealdeal.com +theonlygames.com +metacpan.org +ens.fr +metapress.com +cssmoban.com +zapps.vn +fajar.co.id +staradvertiser.com +autozone.com +uni-wuerzburg.de +knoji.com +mootools.net +mobalytics.gg +peopleperhour.com +uberant.com +qeqeqe.com +qhnky.com +kinokong.org +2checkout.com +tp.edu.tw +fender.com +smith.edu +up.gov.in +anyflip.com +blogspot.gr +pdfescape.com +a16z.com +thrivecart.com +alc.co.jp +iihs.org +cpta.com.cn +labiennale.org +panet.co.il +1gb.ru +agu.org +ebaystatic.com +bitauto.com +turbosquid.com +home.neustar +collegeboard.com +txstate.edu +mondiad.net +cac.gov.cn +ielts.org +binged.it +seths.blog +slideplayer.com +donorbox.org +aon.com +newspicks.com +moveon.org +polyu.edu.hk +tympanus.net +zhangyu.tv +lacity.org +rebrandly.com +gfx.ms +qstheory.cn +bbva.mx +learningapps.org +decider.com +myrussianbride.net +google.la +fosshub.com +find-a-bride.net +klaviyo.com +nvsp.in +unirioja.es +pilotonline.com +userapi.com +dressinn.com +vonage.com +morguefile.com +carecredit.com +adop.cc +app.com +gem.gov.in +zhuwang.cc +comparitech.com +standaard.be +affirm.com +davivienda.com +crello.com +sudouest.fr +travelerdoor.com +destructoid.com +amadeus.com +megaphone.fm +xhamsterpremium.com +nursery.com.pk +personalbadcreditloans.net +thestranger.com +bytefence.com +aternos.org +google.ad +book118.com +qiwi.com +mandrillapp.com +videopress.com +cbr.ru +firstcry.com +vodafone.com +igodigital.com +m1finance.com +ucpress.edu +braintreegateway.com +adventori.com +check24.de +ultraviewer.net +placed.com +theprint.in +pojoksatu.id +jewishvirtuallibrary.org +standardchartered.com +google.com.pa +creditcards.com +hellofresh.com +wolterskluwer.com +internetsociety.org +bigmir.net +tripadvisor.es +careers360.com +google.co.mz +google.com.gh +linuxjournal.com +sqworl.com +clickorlando.com +la-croix.com +nanowrimo.org +vzw.com +carbonblack.io +israelnationalnews.com +beeline.ru +elledecor.com +deere.com +key.com +babylon.com +xmu.edu.cn +runsignup.com +joinhandshake.com +eztv.re +ntnu.edu.tw +apkmirror.com +google.co.bw +perm.ru +pandora.net +goindigo.in +tdatamaster.com +cex.io +cubadebate.cu +elvenar.com +beget.tech +hipwee.com +ksu.edu +jbhifi.com.au +shandong.gov.cn +ffmpeg.org +haiwainet.cn +urssaf.fr +grafthivecrocus.cam +problogger.com +google.com.cy +telangana.gov.in +uwindsor.ca +openrice.com +jcp.org +francebleu.fr +scotusblog.com +zoopla.co.uk +globalcitizen.org +wpcomstaging.com +ekstrabladet.dk +videoamp.com +marketo.net +stylecaster.com +eventbrite.com.au +homeaway.com +noor-book.com +pbebank.com +photoshop.com +adsmoloco.com +freshworks.com +salute.gov.it +stocksnap.io +hunter.io +tv2.no +sundance.org +ucc.ie +geni.us +elkhabar.com +en-japan.com +life360.com +jetro.go.jp +chuandong.com +hyperallergic.com +brightmountainmedia.com +newsbomb.gr +bmwgroup.com +webdesignerdepot.com +researchandmarkets.com +sumibuy.com +google.co.ug +plarium.com +greenbiz.com +lianlianpay.com +skimlinks.com +uptvs.com +openweathermap.org +futureplc.com +fee.org +jekyllrb.com +edf.org +blinklist.com +reliancedigital.in +sacred-texts.com +jomodns.com 
+y-medialink.com +gsma.com +xspdf.com +godaddysites.com +oliveogrill.com +praca.gov.pl +auone.jp +angularjs.org +imgbox.com +compressjpeg.com +capitaloneshopping.com +aramex.com +techwalla.com +acmethemes.com +shop.app +growingio.com +lankasri.com +bancobai.ao +google.al +qr-code-generator.com +citizensadvice.org.uk +labcorp.com +xinnet.com +seroundtable.com +opswat.com +kennedy-center.org +mediatemple.net +sicredi.com.br +elgenero.com +wesleyan.edu +kiro7.com +autocar.co.uk +awardspace.com +datingreviewer.net +middleeasteye.net +dundee.ac.uk +west.cn +hs-sites.com +lbpicmt.com +guitarcenter.com +castbox.fm +iwm.org.uk +opera-api.com +naftemporiki.gr +environment.gov.au +filefactory.com +charter.net +jobsdb.com +piwik.org +doctissimo.fr +le.com +knet.cn +webcrawler.com +elbilad.net +umm.edu +anonymize.com +uca.fr +1717pk.com +dcfever.com +rawpixel.com +yourdomain.com +ulg.ac.be +bootstrapmade.com +turnkeylinux.org +translatewiki.net +file-upload.com +bnmla.com +nxtbook.com +ok.xxx +ludashi.com +e-gov.az +preply.com +on.cc +prosieben.de +google.bs +studopedia.ru +cfainstitute.org +westword.com +multitran.com +trip.com +mbank.pl +uni-kiel.de +doingbusiness.org +hdzog.com +joann.com +591.com.tw +ultimatix.net +mentalhealth.org.uk +sucursalelectronica.com +dti.ne.jp +travis-ci.org +dea.gov +cs-cart.com +rsna.org +oann.com +asha.org +wko.at +mmo-champion.com +akwam.co +shine.com +cqnews.net +cnki.com.cn +hatenablog.jp +hypotheses.org +papernow.org +phoenix.edu +cnipa.gov.cn +gimy.co +chefkoch.de +ufc.com +gumtree.co.za +ki.se +ekantipur.com +metinfo.cn +vk.me +lordfilm.so +westlaw.com +ndl.go.jp +oxfordlearnersdictionaries.com +japan-guide.com +tstatic.net +nationaljournal.com +surfline.com +fr.de +namequery.com +probuilds.net +richmond.edu +ihs.com +caijing.com.cn +xiachufang.com +accesstrade.net +passkey.com +hardrock.com +icook.tw +serif.com +artfire.com +google.sn +eyereturn.com +atimes.com +register.com +bmw.com +google.mg +informit.com +marquette.edu +netund.com +gamepass.com +lush.com +jigsawplanet.com +wayfair.ca +teamtreehouse.com +ksapisrv.com +aktuality.sk +muenchen.de +franceculture.fr +alibabacloud.com +sciencemuseum.org.uk +adpone.com +tmweb.ru +truepush.com +admixer.net +cqvip.com +perimeterx.net +360buyimg.com +uibk.ac.at +odn.ne.jp +slu.edu +steelseries.com +native-instruments.com +couriermail.com.au +santander.cl +transfermarkt.de +yunexpress.com +jobui.com +51ade.com +leanplum.com +sefon.pro +dartsearch.net +dailyhunt.in +espacenet.com +wellandgood.com +cbpp.org +1001freefonts.com +abqjournal.com +celtra.com +thefreelibrary.com +egov-nsdl.com +fy169.net +alturl.com +gomlab.com +echosign.com +livenation.com +theplatform.com +datingstudio.com +teamwork.com +loccitane.com +google.com.jm +brunch.co.kr +vimeopro.com +termly.io +assemblee-nationale.fr +ricardo.ch +aol.co.uk +sportsmansoutdoorsuperstore.com +zakupki.gov.ru +gg.gg +qatarliving.com +china360.cn +alquds.co.uk +tigerdirect.com +seeking.com +thehotline.org +telus.com +genome.gov +melia.com +trafficjunky.net +sdo.com +limetorrents.info +soufun.com +comedycentral.com +widespace.com +cgtrader.com +stackpathdns.com +toledoblade.com +yn.gov.cn +hsbc.co.uk +hubapi.com +zurb.com +quicinc.com +jabra.com +instapage.com +wenxuecity.com +tvrain.ru +einnews.com +heytapimage.com +stereogum.com +moj.go.jp +steepto.com +kariyer.net +dkb.de +myfolio.com +immi.gov.au +cloudscar.com +gongkong.com +nianticlabs.com +pikbest.com +discourse.org +crsky.com +acehardware.com +rhs.org.uk +justin.tv +lagou.com +essence.com +chinacourt.org 
+pclady.com.cn +wdl.org +kw.com +google.bi +betweendigital.com +toronto.edu +labnol.org +synology.me +syri.net +cpj.org +yieldlab.net +hclips.com +eero.com +chimpstatic.com +marketingweek.com +online2pdf.com +meitu.com +agilent.com +time.is +proximabeta.com +usertesting.com +mandarinoriental.com +maff.go.jp +mskcc.org +tinypass.com +allbusiness.com +williamhill.com +essex.ac.uk +tass.com +comenity.net +verydesigner.cn +groww.in +elo7.com.br +blic.rs +uoa.gr +pushmart.net +hugoboss.com +ncore.cc +manhuagui.com +internic.net +arsenal.com +fashionnova.com +mondaq.com +pge.com +greentechmedia.com +questdiagnostics.com +ncjrs.gov +zblogcn.com +themeansar.com +ngs.ru +kansas.com +brandpa.com +insee.fr +gizmochina.com +answerthepublic.com +yandex.com.tr +sbs.co.kr +megafon.ru +vg247.com +grailed.com +everquote.com +sc.gov.cn +defra.gov.uk +exhentai.org +thewindowsclub.com +getflywheel.com +dynadot.com +zuimeitianqi.com +backcountry.com +advfn.com +bucknell.edu +entekhab.ir +kgw.com +jomashop.com +khan.co.kr +pdf2doc.com +poly.com +monografias.com +tekcities.com +fastcoexist.com +smrtb.com +discoveryeducation.com +helpscout.com +athenahealth.com +paper-helper.org +wenjuan.com +google.md +adsoftheworld.com +webengage.com +telmex.com +tv5monde.com +jp.net +izvestia.ru +google.mk +n4g.com +json.org +lancaster.ac.uk +mihanwebhost.com +uhaul.com +starfall.com +mathway.com +proprofs.com +nationalmssociety.org +financesonline.com +smotrim.ru +journalism.org +hometalk.com +owncloud.com +appsumo.com +sxc.hu +modernhealthcare.com +voxeu.org +pornolab.net +w7000.com +zaycev.net +habrahabr.ru +spot.im +lavozdegalicia.es +feng.com +ametsoc.org +google.com.bn +osichem001.com +goo-net.com +governing.com +fox8.com +getdrip.com +coltortiboutique.com +indusind.com +cdek.ru +lamoda.ru +cbd.int +shalltry.com +astrazeneca.com +intesasanpaolo.com +gameanalytics.com +babyshop.com +jenkins.io +unibas.ch +stylemixthemes.com +iponweb.net +wgbh.org +money.pl +tpsl-india.in +mypearson.com +maven.org +yjtag.jp +islcollective.com +metrolyrics.com +webkit.org +jd.hk +hmrc.gov.uk +dailystrength.org +ajio.com +newchic.com +playbill.com +haoyangmao8.com +handbrake.fr +ohsu.edu +siftscience.com +omnihotels.com +uline.com +zenhabits.net +fc.lc +educacao.mg.gov.br +name.com +squadhelp.com +google.com.na +reebok.com +oray.com +lectortmo.com +owncloud.org +alison.com +xenforo.com +gamespy.com +doi.gov +boisestate.edu +google.ps +ascii.jp +ponisha.ir +your-server.de +desjardins.com +iltalehti.fi +blogblog.com +tvline.com +gcs-web.com +freakonomics.com +aktualne.cz +maidi.me +wallpaperaccess.com +videvo.net +massgeneral.org +codinghorror.com +cebbank.com +jumpcloud.com +youcaring.com +nexac.com +wpzoom.com +dsw.com +mforos.com +gcloudsdk.com +blogcms.jp +swiftserve.com +alignable.com +backblaze.com +aso1.net +gametrailers.com +clicktale.net +tripadvisor.it +gunbroker.com +thebodyshop.com +ovid.com +besthookupwebsites.net +ylsw.com +skyroom.online +tu-darmstadt.de +zonealarm.com +ijie.com +research.net +unil.ch +nikonusa.com +hobbs.com +customink.com +takungpao.com +eleconomista.com.mx +seneweb.com +businesstimes.com.sg +ugm.ac.id +flyertea.com +4dex.io +nctu.edu.tw +e1.ru +moncler.com +gamingbible.co.uk +essaywriting.org +laughingsquid.com +enotes.com +netapp.com +uba.ar +sytes.net +gansu.gov.cn +oakley.com +barcelona.cat +olemiss.edu +publons.com +flexmls.com +clickmeeting.com +aphapublications.org +onelink.to +sakura.ad.jp +local10.com +udayton.edu +svd.se +avastbrowser.com +australia.com +shopifycloud.com +winshang.com +amazon.nl 
+xxinn887.com +lci.fr +kaist.ac.kr +90tiyu.com +chem17.com +thedailymeal.com +disney.co.jp +wizzair.com +sam.gov +jobrapido.com +mediawallahscript.com +journaldesfemmes.fr +aeroflot.ru +tb.cn +ufmg.br +museodelprado.es +bathandbodyworks.com +utsa.edu +gomhuriaonline.com +library.lol +unibet.com +warbyparker.com +dogpile.com +dailysignal.com +daytondailynews.com +qiku.com +car.gr +ycwb.com +uni-lj.si +stamped.io +bmwusa.com +toshiba.co.jp +malaymail.com +italki.com +trend.az +tirexo.io +art.com +ant.design +typing.com +google.com.ai +blogspot.pt +wickedlocal.com +gov.on.ca +juicyads.com +meeshosupply.com +1stdibs.com +rts.ch +technion.ac.il +ah.gov.cn +adbro.me +everestjs.net +sou300.com +gelocal.it +ferret-plus.com +cpubenchmark.net +titlemax.us +linkhaitao.com +blogspot.com.ar +carvana.com +ng.ru +teepublic.com +digital-photography-school.com +joox.com +yourwownews.com +google.com.ag +7news.com.au +razerzone.com +baixaki.com.br +kcrw.com +sogoucdn.com +bybit.com +eurosport.com +liv.ac.uk +720pizle.org +niedersachsen.de +hawaiinewsnow.com +mercadolivre.com +zdusercontent.com +pillpack.com +woman.ru +iii.org +lifesitenews.com +latam.com +fashionista.com +byrenjia.com +walkscore.com +finder.com +sgi.com +papajohns.com +firebaseapp.com +delawareonline.com +ufreegames.com +gingersoftware.com +bitmex.com +journalstar.com +moj.gov.cn +shaanxi.gov.cn +webgarden.com +myfxbook.com +ptc.com +wpmudev.org +oceanwp.org +leiphone.com +uncommongoods.com +ecnu.edu.cn +hiworks.com +dapenti.com +rense.com +127.net +katu.com +besoccer.com +hinative.com +pitchbook.com +waterstones.com +monsterindia.com +iauec.ac.ir +pushapi.online +dcloud.net.cn +farnell.com +naacp.org +yayoi-kk.co.jp +qualaroo.com +chabad.org +blog.gov.uk +latrobe.edu.au +nysed.gov +liquidweb.com +google.co.uz +kurier.at +jidapharm.com +mywebsite-editor.com +benchmarkemail.com +ariba.com +al-monitor.com +webd.pl +oa.com +splunk.com +alphacoders.com +fec.gov +huaxingchem.com +sqwyw.org +uc.pt +retaildive.com +washingtonmonthly.com +elearningindustry.com +artisteer.com +hookupwebsites.org +apptimize.com +kxan.com +nnov.ru +doramasmp4.com +realmailorderbrides.com +adtilt.com +guru3d.com +art19.com +noip.com +admedo.com +techbang.com +wetv.vip +btinternet.com +karnataka.gov.in +korrespondent.net +postfun.com +bluewin.ch +shangxueba.com +mailorderbrides.us +applvn.com +docstoc.com +jarir.com +scpr.org +accountkit.com +sharedid.org +duowan.com +elifesciences.org +xnxx-cdn.com +3dnews.ru +submittable.com +kempinski.com +hangzhou.com.cn +vbulletin.com +jining.com +snhu.edu +rsa.com +phillymag.com +getpostman.com +gfk.com +daraz.com.bd +converse.com +1sept.ru +cryptobrowser.site +23hq.com +squareblogs.net +oui.sncf +voz.vn +gmx.com +publico.pt +flexera.com +115.com +americanbanker.com +google.tt +theme.co +pulitzer.org +coolmathgames.com +8684.cn +geoguessr.com +google.co.zm +thegospelcoalition.org +mt.co.kr +dergipark.org.tr +sitey.me +chrome.com +americanheart.org +xbytessolucoes.com +gradeup.co +tv9kannada.com +telegraphindia.com +javhd.com +ghacks.net +essay-company.com +google.bf +radiofarda.com +perfectgirls.net +zapmeta.ws +woobox.com +vans.com +pressdemocrat.com +auto.ru +boxcdn.net +airbnb.co.uk +google.fm +mellowads.com +shopifycdn.com +doctorswithoutborders.org +m-team.cc +nissanusa.com +newwife.net +leju.com +careeronestop.org +photon.com +decipherinc.com +bbva.es +cointiply.com +gold678.com +alahlionline.com +streameast.live +visualwebsiteoptimizer.com +iwanttodeliver.com +tilda.cc +hotrussianwomen.net +itproportal.com +bigstockphoto.com 
+yougov.co.uk +revolut.com +jamesclear.com +careerpower.in +thewaltdisneycompany.com +otago.ac.nz +highcharts.com +webestools.com +grooveshark.com +themarysue.com +montiboutique.com +mcafeewebadvisor.com +z.com +visitlondon.com +londonstockexchange.com +google.tm +kazeo.com +orst.edu +thrivethemes.com +jstv.com +350.org +mturk.com +majorgeeks.com +xunta.gal +fgv.br +russianfood.com +1c.ru +uberinternal.com +overcast.fm +9ku.com +digitec.ch +goskope.com +ap7am.com +radaronline.com +phone.com +socialnewpages.com +southmoney.com +1c-bitrix.ru +weizmann.ac.il +google.cg +creativelive.com +rescuetime.com +tv2.dk +shaolianhu.com +sportsmole.co.uk +cdnico.net +hero-wars.com +eljur.ru +uni-erlangen.de +barclaycardus.com +uimaker.com +summerhamster.com +chng.it +sanwen8.cn +ename.net +cash-central.net +doostihaa.com +plus.com +sigmaaldrich.com +etoland.co.kr +fossbytes.com +runative-syndicate.com +fstoppers.com +bna.com +google.co.ck +level3.com +google.dm +to10.gr +leam.com +bjs.gov +pingdom.net +privy.com +realtor.org +googlesource.com +byrdie.com +ignou.ac.in +chapman.edu +klook.com +rtings.com +intercom.help +mycdn.me +robbreport.com +xmsecu.net +info.com +usal.es +emerse.com +upmusics.com +bookshop.org +earthcam.com +weddingbee.com +the-sun.com +starfieldtech.com +vresp.com +courtlistener.com +vub.ac.be +cashnetusaapplynow.com +otnolatrnup.com +patient.info +google.com.bz +indeedassessments.com +fieldengineer.com +timeoutcn.com +moondoge.co.in +startappservice.com +autoevolution.com +ogunhaber.com +bitdefender.net +jibjab.com +clideo.com +trenitalia.com +tmall.hk +hokudai.ac.jp +channelmyanmar.org +dbnnmmxo.com +royalcaribbean.com +digitimes.com +vwg-connect.cn +besthookupwebsites.org +money.com +samsungotn.net +nation.co.ke +tureng.com +appstate.edu +comptia.org +checkout.com +timesfreepress.com +uns.ac.id +audi.com +senat.fr +escapistmagazine.com +comdirect.de +newssc.org +cib.com.cn +marieclaire.com.tw +mca.gov.in +biligame.com +ktvu.com +pinterest.ch +gestyy.com +hamyarwp.com +desire2learn.com +lohud.com +online.de +eiseverywhere.com +rocketnews24.com +wordle.net +opentext.com +xtemos.com +lmgtfy.app +wwu.edu +google.vg +boursorama.com +katmoviehd.se +tuiusuoxue.com +merkur.de +lawtime.cn +arabianbusiness.com +quantummetric.com +thesimpledollar.com +workplace.com +magentocommerce.com +visitscotland.com +rover.com +theamericanconservative.com +providesupport.com +fixya.com +bpb.de +easychair.org +expertpaperwriter.com +cnr.cn +blog.hu +brynmawr.edu +manual.canon +mysynchrony.com +sci-hub.do +in.com +rusprofile.ru +usm.edu +gcu.edu +gopejk.com +vagina.nl +texasmonthly.com +point2homes.com +wikia.org +classiccars.com +osticket.com +optimole.com +lancs.ac.uk +ziraatbank.com.tr +tnaflix.com +colgate.com +yelp.ca +sgs.com +cat.com +wpfr.net +martindale.com +gangde.net +globe.com.ph +kobobooks.com +cryptocompare.com +dowjones.com +reedsy.com +cnstock.com +asiae.co.kr +ntv.io +coinsbit.io +akhbarona.com +seo.com +google.com.gi +providencejournal.com +movistarplus.es +pib.gov.in +pipex.com +bosch.com +admob.com +leukemiatwinklesagacious.com +instrument.com.cn +hotwire.com +wcvb.com +dotesports.com +binary.com +sixflags.com +newduba.cn +pxhere.com +convinceandconvert.com +google.kg +elnabaa.net +tlauncher.org +adobesc.com +kfw.de +weedmaps.com +alphonso.tv +canberratimes.com.au +jumia.ma +harveynichols.com +genieesspv.jp +qconcursos.com +google.com.fj +iisd.org +governo.it +dspunion.com +skyscanner.com +gazette.com +simplypsychology.org +google.sh +traderjoes.com +vseinstrumenti.ru +juksy.com 
+metopera.org +atlanticcouncil.org +36dm.com +nbcmiami.com +roberthalf.com +osf.io +google.rw +sbb.ch +taxheaven.gr +aptoide.com +carview.co.jp +google.co.tz +seemorgh.com +casadellibro.com +ravm.tv +tn.com.ar +canonical.com +nudevista.com +best2020-games-web1.com +digistore24.com +noxinfluencer.com +zqtk.net +armani.com +crossfit.com +netsolhost.com +uni-bielefeld.de +dailyfx.com +tmon.co.kr +microsoftstore.com +sellfy.com +hkex.com.hk +chick-fil-a.com +wechatos.net +dx.com +barstoolsports.com +brownsfashion.com +vpser.net +nuget.org +gotinder.com +distractify.com +ad-score.com +haber61.net +seventeen.com +carscoops.com +greasyfork.org +austlii.edu.au +scdn.vn +almalnews.com +tsn.ua +domain.name +spokeo.com +shipstation.com +tohoku.ac.jp +aparat.cam +chownow.com +myzaker.com +theladders.com +rosettastone.com +pixar.com +prettybrides.net +vix.com +thesundaytimes.co.uk +value-domain.com +cao.go.jp +unisa.edu.au +worldscientific.com +tessabit.com +start.me +groovefunnels.com +teratail.com +ya.ru +p-n.io +openai.com +ren.tv +splice.com +google.ws +zywjw.com +goldprice.org +countryattire.com +ygdy8.net +newser.com +kongfz.com +omicsonline.org +skidrowreloaded.com +appnext.com +littlecdn.com +undocs.org +chinabus.info +cmaj.ca +mb.com.ph +umms.org +zybang.com +songatak.vip +signonsandiego.com +minecraft-mp.com +qiniup.com +montereybayaquarium.org +dailygram.com +alza.cz +state.mi.us +ingramer.com +tuv.com +iktogo.com +k-msedge.net +gymshark.com +megamillions.com +motherearthnews.com +peraichi.com +lidingzhong.com +fqtag.com +revues.org +vnsmart.com.vn +regonline.com +os.tc +vw.com +popmech.ru +newsbeast.gr +placeit.net +privateinternetaccess.com +rebelmouse.com +eepw.com.cn +hao245.com +ocnk.net +physicsworld.com +zocdoc.com +newsis.com +cpuid.com +cash-advanceloan.net +carnegieendowment.org +cosmo.ru +getui.net +anquan.org +pirelli.com +subscribeonandroid.com +wunderlist.com +malaysiakini.com +utep.edu +balkanweb.com +islamweb.net +cartitleloans.biz +wgu.edu +india.gov.in +myasianbride.net +programmableweb.com +r18.com +maersk.com +chathamhouse.org +netease.im +interaction-design.org +legaldaily.com.cn +skai.gr +team-bhp.com +newcastle.edu.au +approved-cash.com +sponichi.co.jp +google.sc +england.nhs.uk +bytetcdn.com +citationmachine.net +payu.com +marketbeat.com +rivm.nl +freenode.net +bentley.com +xn--42c9bsq2d4f7a2a.com +associatedcontent.com +google.so +premier.one +rstudio.com +coinspot.com.au +kali.org +nt.gov.au +mass.edu +larazon.es +google.bt +yingjiesheng.com +shopstyle.co.uk +rfa.org +abcactionnews.com +sony.co.jp +nbabite.com +kogan.com +gdpr-info.eu +rtbsystem.org +censor.net +google.bj +ushistory.org +gvsu.edu +2ch.net +blogg.se +cs.com.cn +istruzione.it +icourse163.org +achilles-ena.com +rentalcars.com +okex.com +google.gg +csai.cn +stamps.com +theclutcher.com +tl88.net +diadona.id +openx.com +carsensor.net +libertaddigital.com +geocities.ws +bd-caict.com +familydoctor.org +visitsealife.com +cdmx.gob.mx +rips.icu +wpsmail.net +valassisdigital.io +nel.goog +bloomberg.co.jp +activemind.de +carto.com +nagoya-u.ac.jp +wp-events-plugin.com +kioerd.com +miamidade.gov +google.co.zw +mn.co +tvpixel.com +menards.com +hcaptcha.com +mtholyoke.edu +stdaily.com +moonbit.co.in +antena3.com +google.vu +fitchratings.com +blogher.com +post.ch +bps.org.uk +getdropbox.com +google.gl +volunteermatch.org +therealreal.com +wmagazine.com +ijinshan.com +ecu.edu +localsaver.com +ard.de +vinted.fr +doda.jp +awsapps.com +fashion-press.net +carbonite.com +gutefrage.net +blogspot.ro +kew.org 
+homeoffice.gov.uk +myus.com +snappfood.ir +hsadspixel.net +personalcapital.com +premiumtimesng.com +lotteon.com +ifc.org +fastspring.com +uncg.edu +umweltbundesamt.de +1and1-editor.com +doleta.gov +google.ml +tailorbrands.com +google.cv +dslreports.com +wemakeprice.com +simplywall.st +disp.cc +juicer.io +kiwibox.com +yhd.com +gulte.com +acsevents.org +brainly.lat +royalessays.co.uk +k2s.cc +filmfreeway.com +rp.pl +ne.gov +prodigygame.com +onthe.io +securedvisit.com +themailorderbride.com +redwap.me +chargebee.com +rtbme24.com +bershka.com +screamingfrog.co.uk +grants.gov +els-cdn.com +worldmarket.com +ilmessaggero.it +casasbahia.com.br +contentsquare.net +xmission.com +sciencing.com +google.dj +haberzamani.com +ppy.sh +sanjesh.org +canterbury.ac.nz +fitsmallbusiness.com +sass-lang.com +valvesoftware.com +pubhtml5.com +riverisland.com +samba.tv +r-ad.ne.jp +elite-brides.com +lidl.de +payumoney.com +abcya.com +redmine.org +google.com.pg +hemingwayapp.com +cookinglight.com +company.site +infineon.com +hanime.tv +podia.com +publicintegrity.org +cimbclicks.com.my +useit.com +bbvanet.com.mx +kontiki.com +edmontonjournal.com +google.mv +ura.news +cash4day.com +patrika.com +sinajs.cn +howtoforge.com +opoxv.com +crossref.org +footlocker.com +behindwoods.com +bloody-disgusting.com +qccoccocmedia.vn +abendblatt.de +bigbasket.com +themepalace.com +5ykj.com +siu.edu +worldpopulationreview.com +gepush.com +cashlandloans.net +cloudmobi.net +thetrevorproject.org +wlp-acs.com +tanja24.com +china-embassy.org +procon.org +ny1.com +indiafcdn.com +stylecraze.com +getadblock.com +etracker.de +liveworksheets.com +epizy.com +laracasts.com +euromonitor.com +bookcrossing.com +codal.ir +createsend.com +jl.gov.cn +ashemaletube.com +whitehatjr.com +gxnews.com.cn +wpml.org +cluodlfare.com +hpanalytics.net +pepperdine.edu +sunysb.edu +cibercuba.com +slader.com +uam.es +chemnet.com +ancient.eu +cookiedatabase.org +epic.org +skribbl.io +hdfcsec.com +v.gd +nettruyen.com +helixsleep.com +museivaticani.va +mediaindonesia.com +uoc.edu +oriflame.com +samcart.com +myinstallmentloans.net +creighton.edu +msocdn.com +nationbuilder.com +test.de +cb2.com +marchofdimes.org +ixxx.com +busuu.com +internetlivestats.com +howard.edu +ac-versailles.fr +joystiq.com +metrotimes.com +hudl.com +psbc.com +indodax.com +sonyericsson.com +sport24.gr +newsbreak.gr +calvin.edu +payeer.com +riverfronttimes.com +epower.cn +hotelscombined.com +famousbirthdays.com +360cities.net +krushmedia.com +aleteia.org +trade.gov +jiangxi.gov.cn +wpscdn.com +getapp.com +inflationbreedinghoax.com +kwsp.gov.my +biologists.org +xhamster4.com +bebo.com +transfermarkt.com +kontakt.az +google.ga +russianqupid.com +oreillynet.com +adelaidenow.com.au +samsungcloudcdn.com +floridatoday.com +asianetnews.com +openclipart.org +jc001.cn +uma.es +lexmark.com +nationmaster.com +tripadvisor.ru +siteimproveanalytics.com +peopleapp.com +unionbankonline.co.in +clickup.com +welivesecurity.com +aa.org +find-your-bride.com +adlooxtracking.com +pubgmobile.com +constitutioncenter.org +merckmanuals.com +360kuai.com +internetworldstats.com +vdo.ai +brides-to-be.com +marines.mil +eb.mil.br +php-fig.org +topix.com +asic.gov.au +jiosaavn.com +shopdisney.com +uakron.edu +bri.co.id +ripple.com +matadornetwork.com +lofter.com +london.edu +bubble.io +thequint.com +internetgundem.com +competethemes.com +whispersystems.org +datingrating.net +rspb.org.uk +google.je +pinknews.co.uk +whatwg.org +bignox.com +pangolin-sdk-toutiao.com +geology.com +backlog.com +abstractfonts.com +forlumineontor.com 
+odysee.com +istv.com.cn +pantheon.io +alwatanvoice.com +google.ms +solidot.org +i-ready.com +emlakgundemi.com.tr +wearesocial.com +google.st +kemenag.go.id +qiyi.com +gaia.com +transip.nl +gamefront.com +modelhub.com +forecast7.com +songkick.com +coinmill.com +parspack.com +shanghai.gov.cn +haodf.com +eib.org +xhamster2.com +finextra.com +glaad.org +france.fr +rim.or.jp +google.mw +phoenixnewtimes.com +seobook.com +google.gp +danfoss.com +practo.com +stitchfix.com +inaturalist.org +bovada.lv +cvut.cz +mubi.com +nykaa.com +etnews.com +hatenadiary.com +safeway.com +mobhey.com +smartbrief.com +up.ac.za +arkansasonline.com +www.gov.hk +nhaccuatui.com +bhf.org.uk +vitacost.com +cgiar.org +hateblo.jp +google.com.sb +carbonbrief.org +bostonmagazine.com +google.cf +dvdvideosoft.com +huduser.gov +csuchico.edu +online-loan.org +jbl.com +mailorderbrides.dating +uncrate.com +kopilkaurokov.ru +ray-ban.com +adr.org +info.gov.hk +sportsnet.ca +sandiego.edu +cmswire.com +image-line.com +michaelkors.com +flattr.com +bilivideo.com +benjerry.com +liberal.gr +i-mobile.co.jp +presstv.com +mediaad.org +openwrt.org +litmus.com +timesofindia.com +babson.edu +samqaicongen.com +picofile.com +powerapps.com +seg-social.es +diariopanorama.com +lycos.co.uk +wsj.net +bamgrid.com +real.de +forbes.com.mx +amna.gr +villanova.edu +t3.com +newsauto.gr +gumtree.pl +onegreenplanet.org +apc.org +lzu.edu.cn +topnaz.com +hdfilmcehennemi2.pw +defimedia.info +dota2.com +top10chinesedatingsites.com +accaglobal.com +google.co.ls +aau.dk +zumiez.com +digitalcameraworld.com +svoboda.org +straightdope.com +fragrantica.ru +gotquestions.org +essaysrescue.com +cloudapp.net +lucid.app +lacoste.com +warnermediacdn.com +sg-host.com +cp4srvng.xyz +cityofchicago.org +hellogiggles.com +redtube.zone +mouseflow.com +google.tl +f5.com +wishpond.com +wright.edu +justgetflux.com +economia.gov.br +wazirx.com +wv.gov +xbytes.ao +impdesk.com +google.sm +ccidnet.com +tecmundo.com.br +textnow.com +chinabyte.com +unito.it +fresnobee.com +greatergood.com +a2hosting.com +nankai.edu.cn +news12.com +extremereach.io +jvzoo.com +fusion.net diff --git a/src/atextcrawler/config.py b/src/atextcrawler/config.py new file mode 100644 index 0000000..0a07727 --- /dev/null +++ b/src/atextcrawler/config.py @@ -0,0 +1,337 @@ +""" +Configuration loader and validator. +""" + +import os +import re +import sys +from io import TextIOBase +from pathlib import Path +from typing import Any, Optional, Union + +from voluptuous import All +from voluptuous import Any as VAny +from voluptuous import Invalid, Length, Range, Required, Schema, Url +from yaml import load + +try: + from yaml import CLoader as Loader # type: ignore +except ImportError: + from yaml import Loader # type: ignore + + +class ConfigError(Exception): + """ + Application configuration error. + """ + + def __init__(self, err): + self.msg = str(err) + + def __str__(self): + return f'Application configuration error: {self.msg}' + + +class Config: + """ + Application configuration. + + Access the full application configuration using :meth:`get`. + + It is a dictionary with these keys: + + * 'directory': the configuration directory being used + * 'main': the main configuration from main.yaml, but + postgresql configuration may be overriden by environment + variable ATEXTCRAWLER_POSTGRESQL + """ + + config = None + + @classmethod + def get( + cls, + out: Optional[TextIOBase] = None, + ) -> Optional[dict]: + """ + Load and validate app configuration if not already done; return it. 
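+
+        A minimal usage sketch::
+
+            config = Config.get()
+            pg_conf = config['postgresql']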
+
+        On errors, print them to *out*; if *out* is sys.stdout, also
+        exit with exit code 2. Otherwise return None.
+        """
+        if cls.config:
+            return cls.config
+        if out is None:
+            out = sys.stdout  # type: ignore
+        _config = _load_config()
+        msg = None
+        if isinstance(_config, ConfigError):
+            msg = f'ERROR: configuration could not be loaded: {_config}'
+        else:
+            config = _validate_config(_config)
+            if isinstance(config, ConfigError):
+                config_dir = _config.get('config_dir')
+                msg = (
+                    f'ERROR: invalid configuration in {config_dir}:'
+                    f' {config}'
+                )
+        if msg is not None:
+            print(msg, file=out)
+            if out == sys.stdout:
+                sys.exit(2)
+            else:
+                return None
+        config['postgresql']['min_size'] = config['crawl']['workers'] + 2
+        config['postgresql']['max_size'] = config['crawl']['workers'] + 2
+        cls.config = config
+        return config
+
+
+def _load_config() -> Union[ConfigError, dict]:
+    """
+    Load configuration; search in multiple directories.
+
+    We search these locations; the first location containing main.yaml
+    will be used::
+
+    * a directory defined in environment variable ATEXTCRAWLER_CONFIG_DIR
+    * subdir .config/atextcrawler in the user's home (`$HOME`)
+    * /etc/atextcrawler
+
+    In the same directory where this main.yaml is located a subdirectory
+    'plugins' must exist and contain the configurations of plugins.
+
+    On failure return a ConfigError. Otherwise return the main
+    configuration (a dict) with these additions:
+
+    * key `config_dir`: the configuration directory being used
+    * key `postgresql`: possibly overridden from the environment
+      (see :func:`_get_env_postgresql`)
+    """
+    config_dirs = []
+    if env_conf := os.environ.get('ATEXTCRAWLER_CONFIG_DIR'):
+        config_dirs.append(Path(env_conf))
+    if env_home := os.environ.get('HOME'):
+        config_dirs.append(Path(env_home) / '.config' / 'atextcrawler')
+    config_dirs.append(Path('/etc/atextcrawler'))
+    for config_dir in config_dirs:
+        main_yaml_path = config_dir / 'main.yaml'
+        if main_yaml_path.exists():
+            break
+    else:
+        locs = ', '.join([str(loc) for loc in config_dirs if loc])
+        msg = (
+            f'Missing main.yaml in all config locations: {locs}\n'
+            f'Hint: You may use environment variable ATEXTCRAWLER_CONFIG_DIR'
+            f' to define a custom config directory.'
+        )
+        return ConfigError(msg)
+
+    # load main.yaml
+    try:
+        with main_yaml_path.open() as main_yaml:
+            main_config = load(main_yaml.read(), Loader=Loader)
+    except Exception as err:
+        return ConfigError(f'Invalid YAML in {main_yaml_path}:\n {err}')
+
+    # main_config must be a dict
+    if not isinstance(main_config, dict):
+        return ConfigError(f'File {main_yaml_path} must contain a dictionary')
+
+    # postgresql config from environment has precedence
+    postgresql_config = _get_env_postgresql()
+    if isinstance(postgresql_config, ConfigError):
+        return postgresql_config
+    main_config['postgresql'] = postgresql_config or main_config['postgresql']
+
+    main_config['config_dir'] = str(config_dir)
+    return main_config
+
+
+def _get_env_postgresql() -> Union[ConfigError, Optional[dict]]:
+    """
+    Load PostgreSQL config from environment variable ATEXTCRAWLER_POSTGRESQL.
+
+    Return an error or the PostgreSQL config (which is None if
+    the environment variable is not defined).
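+
+    Example value (hypothetical credentials)::
+
+        host=localhost database=atextcrawler user=atextcrawler password='my secret'
+
+    Values may be quoted with single or double quotes; host, port and
+    schema_name default to localhost, 5432 and public, respectively.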
+    """
+    env_var = 'ATEXTCRAWLER_POSTGRESQL'
+    value = os.environ.get(env_var, '').strip()
+    if not value:
+        return None
+    param_names = (
+        'host',
+        'port',
+        'database',
+        'user',
+        'password',
+        'schema_name',
+    )
+    re_dsn = re.compile(
+        '((' + '|'.join(param_names) + ')'
+        '=("(((?=[^"\\\\]).|\\\\.)*)"'  # value in double quotes
+        '|\'(((?=[^\'\\\\]).|\\\\.)*)\''  # value in single quotes
+        '|([^"\' ]*)'  # value unquoted
+        ')( |$))+?'
+    )
+    params = {}
+    for _, varname, _, v1, _, v2, _, v3, _ in re_dsn.findall(value):
+        params[varname] = (
+            v3
+            or (v1 or '').replace('\\"', '"')
+            or (v2 or '').replace("\\'", "'")
+        )
+    if 'host' not in params:
+        params['host'] = 'localhost'
+    if 'port' not in params:
+        params['port'] = '5432'
+    if 'schema_name' not in params:
+        params['schema_name'] = 'public'
+    for name in param_names:
+        if name not in params:
+            return ConfigError(
+                f'Missing {name} in environment variable {env_var}'
+            )
+    params['port'] = int(params['port'])
+    return params
+
+
+def _validate_config(config: Any) -> Union[ConfigError, dict]:
+    """
+    Validate the given configuration and fill in default values.
+
+    If invalid, return only the first error.
+    Otherwise return the configuration with added default values.
+    """
+    try:
+        return schema_main(config)
+    except Exception as err:
+        return ConfigError(err)
+
+
+def plugins_dir(config):
+    """
+    Validate the plugins directory (absolute or relative path).
+
+    If it is a relative path, prepend the config_dir.
+    """
+    config_dir = config['config_dir']
+    plugins_dir = config['plugins_dir']
+    if not plugins_dir.startswith('/'):
+        plugins_dir = str(Path(config_dir) / Path(plugins_dir))
+        config['plugins_dir'] = plugins_dir
+    if not (Path(plugins_dir) / '__init__.py').exists():
+        raise Invalid(f'plugins_dir "{plugins_dir}" has no "__init__.py"')
+    return config
+
+
+def postgresql_identifier(value):
+    """
+    Validate a PostgreSQL identifier.
+    """
+    if not isinstance(value, str) or not re.match(
+        '^[a-z][a-z0-9_]{0,30}$', value
+    ):
+        raise Invalid(
+            f'Invalid PostgreSQL identifier "{value}", '
+            'pattern must be: [a-z][a-z0-9_]{0,30}'
+        )
+    return value
+
+
+def positive_number(value):
+    """
+    Validate a positive number (int or float).
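+
+    For example, `5` and `0.5` pass validation, while `0`, `-1` and
+    `'5'` raise `Invalid`.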
+ """ + if (isinstance(value, int) or isinstance(value, float)) and value > 0: + return value + raise Invalid('Not a positive number') + + +schema_postgresql = Schema( + { + Required('host'): All(str, Length(min=1)), + Required('port', default=5432): All(int, Range(min=0, max=65535)), + Required('database'): All(str, Length(min=1)), + Required('user'): All(str, Length(min=1)), + Required('password'): str, + Required('schema_name', default='public'): postgresql_identifier, + } +) + + +schema_crawl = Schema( + { + Required('workers', default=10): All(int, Range(min=0, max=1000)), + Required('site_delay', default=600): positive_number, + Required('site_revisit_interval', default=3600): positive_number, + Required('resource_delay', default=5): positive_number, + Required('full_crawl_interval', default=864000): positive_number, + Required('feed_crawl_interval', default=86400): positive_number, + } +) + + +schema_elasticsearch = Schema( + { + Required('host'): All(str, Length(min=1)), + Required('api_key'): All(str, Length(min=1)), + Required('id'): All(str, Length(min=1)), + Required('index_base_name'): All(str, Length(min=1)), + } +) + + +schema_tensorflow = Schema( + { + Required('model_server_endpoint'): Url(), + } +) + + +schema_main = Schema( + All( + { + Required('config_dir'): All(str, Length(min=1)), + Required( + 'instance_name', default='atextcrawler' + ): postgresql_identifier, + Required('instance_type', default='prod'): VAny( + 'dev', + 'staging', + 'prod', + ), + Required('log_level', default='info'): VAny( + 'critical', + 'error', + 'warning', + 'info', + 'debug', + ), + Required('plugins_dir', default='plugins'): All( + str, Length(min=1) + ), + Required('postgresql'): schema_postgresql, + Required('crawl'): schema_crawl, + Required('elasticsearch'): schema_elasticsearch, + Required('tensorflow'): schema_tensorflow, + }, + plugins_dir, + ) +) + + +if __name__ == '__main__': + from pprint import pprint + + pprint(Config().get()) diff --git a/src/atextcrawler/crawl.py b/src/atextcrawler/crawl.py new file mode 100644 index 0000000..323e454 --- /dev/null +++ b/src/atextcrawler/crawl.py @@ -0,0 +1,215 @@ +""" +Crawl a site. +""" + +import logging +from datetime import datetime + +import aiohttp + +from .models import Crawl +from .resource import ResourceFetcher, get_site_path, process_site_path +from .site import ( + RobotsInfo, + checkin_site, + checkout_site, + fetch_feeds, + process_site, + update_site, +) +from .tensorflow import TensorFlow + +logger = logging.getLogger(__name__) + + +class CrawlWorker: + """ + Worker fetching sites, crawling their resources and storing statistics. + """ + + def __init__(self, app, worker_number, pool): + self.app = app + self.worker_number = worker_number + self.pool = pool + self.site_delay = self.app.config['crawl']['site_delay'] + self.resource_delay = self.app.config['crawl']['resource_delay'] + self.site = None + self.crawl = None + self.running = True # do crawl + + def __await__(self): + return self.__ainit__().__await__() + + async def __ainit__(self): + await self.startup() + return self + + async def startup(self): + """ + Asynchronous startup. + """ + logger.info(f'Starting worker {self.worker_number}') + self.conn = await self.pool.acquire() + self.session = aiohttp.ClientSession() + self.fetcher = ResourceFetcher(self.session) + self.tf = TensorFlow(self.app, self.session) + + async def shutdown(self): + """ + Asynchronous shutdown. 
+ """ + logger.info(f'Shutting down worker {self.worker_number}') + await self.session.close() + await self.pool.release(self.conn) + + async def run(self): + """ + Worker loop: fetch a site, crawl its resources and store statistics. + + If no site needs to be crawled, sleep for self.site_delay seconds + (configured in crawl.site_delay). + """ + await self.app.sleep(2) + while self.app.running and self.running: + self.site, is_full, more = await checkout_site(self.app, self.conn) + if not self.site: + msg = f'Worker {self.worker_number}: sites exhausted' + logger.debug(msg) + if not more: + await self.app.sleep(self.site_delay) + continue + self.crawl = await get_or_create_crawl( + self.conn, self.site.id_, is_full + ) + try: + if is_full: + site_upd, _ = await update_site( + self.app, + self.fetcher, + self.conn, + self.site.base_url, + site=self.site, + ) + if site_upd and site_upd.crawl_enabled: + self.site = site_upd + await process_site( + self.fetcher, + self.conn, + self.site, + ) + elif self.site.crawl_enabled: + await fetch_feeds(self.fetcher, self.conn, self.site) + if self.site.crawl_enabled: + await self.crawl_resources() + except: + msg = ( + f'Worker {self.worker_number} failed crawl' + f' {self.crawl.id_} of site {self.site.id_}' + f' ({self.site.base_url})' + ) + logger.exception(msg) + await self.crawl.finish( + self.conn, self.app.running and self.running + ) + await checkin_site(self.app, self.conn, self.site, self.crawl) + msg = ( + f'Worker {self.worker_number} finished crawl' + f' {self.crawl.id_}' + ) + logger.debug(msg) + self.site = None + # if we were cancelled, but the app is still running, run again + if self.app.running: + self.running = True + msg = f'Closing crawler {self.worker_number}' + logger.debug(msg) + + async def crawl_resources(self): + """ + Loop over resources of the site and process them. Collect statistics. + + All workers operate on distinct sites, so no need for locking here. 
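+
+        The delay between two resource fetches defaults to the configured
+        crawl.resource_delay; if the site's robots.txt specifies a crawl
+        delay, that one is used instead.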
+ """ + crawl_type = 'full' if self.crawl.is_full else 'feed' + msg = ( + f'Worker {self.worker_number} beginning' + f' {crawl_type} crawl {self.crawl.id_}' + f' of site {self.site.id_} ({self.site.base_url})' + ) + logger.info(msg) + resource_delay = self.resource_delay + robots = await RobotsInfo(self.site.base_url) + if robots.delay: + resource_delay = robots.delay + while self.app.running and self.running: + site_path = await get_site_path( + self.conn, + self.site, + self.crawl.t_begin, + only_new=not self.crawl.is_full, + ) + if not site_path: + msg = ( + f'Worker {self.worker_number} ending crawl' + f' {self.crawl.id_}: paths exhausted' + ) + logger.info(msg) + return + try: + sp_filter = self.app.plugins['filter_site_path'].sp_filter + if sp_filter(self.site, site_path.path, robots): + is_new_resource = await process_site_path( + self.app, + self.worker_number, + self.conn, + self.fetcher, + self.tf, + self.site, + site_path, + ) + if is_new_resource: + self.crawl.n_resources_new += 1 + if is_new_resource is not None: + self.crawl.n_resources += 1 + await self.app.sleep(resource_delay) + else: + sql = ( + "UPDATE site_path SET" + " last_visit=now() at time zone 'UTC'," + " filtered=true" + " WHERE id=$1" + ) + await self.conn.execute(sql, site_path.id_) + except: + msg = ( + f'Worker {self.worker_number} processing path failed' + f' in crawl {self.crawl.id_}: {site_path}' + ) + logger.exception(msg) + site_path.ok_count -= 1 + await site_path.save(self.conn) + msg = ( + f'Worker {self.worker_number}: stopped crawl' f' {self.crawl.id_}' + ) + logger.info(msg) + + +async def get_or_create_crawl(conn, site_id, is_full=True) -> Crawl: + """ + Return a new or existing+unfinished crawl. + + If an existing crawl is found, return it, disregarding whether + it is a full crawl or not. + """ + sql = "SELECT * FROM crawl WHERE site_id=$1 AND t_end is null LIMIT 1" + if row := await conn.fetchrow(sql, site_id): + return await Crawl().load_from_row(row) + else: + # create a new crawl + crawl = Crawl( + site_id=site_id, + is_full=is_full, + t_begin=datetime.utcnow(), + ) + await crawl.save(conn) + return crawl diff --git a/src/atextcrawler/db.py b/src/atextcrawler/db.py new file mode 100644 index 0000000..89bf1c4 --- /dev/null +++ b/src/atextcrawler/db.py @@ -0,0 +1,162 @@ +""" +PostgreSQL connectivity. + +PGPool can be used as context manager. It takes postgresql configuration +parameters and gives a connection pool. +""" + +import logging +import sys +from io import TextIOBase +from pathlib import Path +from traceback import format_exc +from typing import Dict + +import asyncpg + +from .utils.json import json_dumps, json_loads + +logger = logging.getLogger(__name__) + + +class PGPool: + """ + Database connectivity: Provide a connection pool. + + Can be used either as async context manager (giving a pool), + or as a class using async init and the shutdown method and + having the pool attribute. + + After startup self.pool contains a PostgreSQL connection pool + (instance of :class:`asyncpg.pool.Pool`). + + Startup also runs schema migrations (cf. directory `migrations`). 
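+
+    Minimal usage sketch (assuming *postgresql_config* holds valid
+    connection parameters)::
+
+        async with PGPool(postgresql_config) as pool:
+            async with pool.acquire() as conn:
+                await conn.execute('SELECT 1')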
+    """
+
+    def __init__(
+        self,
+        postgresql_config: dict,
+        out: TextIOBase = None,
+        check: bool = True,
+    ) -> None:
+        self.conf = postgresql_config
+        self.out = out or sys.stdout
+        self.check = check
+        self.pool = None
+
+    def __await__(self):
+        return self.__ainit__().__await__()
+
+    async def __ainit__(self):
+        await self.__aenter__()
+        return self
+
+    async def __aenter__(self):
+        """
+        Return the connection pool after an optional check.
+
+        The check tests basic database access and runs missing migrations.
+        If the check fails, return None.
+        """
+        pool_params = {
+            key: val
+            for key, val in self.conf.items()
+            if key
+            in (
+                'host',
+                'port',
+                'database',
+                'user',
+                'password',
+                'max_size',
+                'min_size',
+            )
+        }
+        pool_params['command_timeout'] = 30
+        self.pool = await asyncpg.create_pool(**pool_params, init=self._init)
+        if self.check:
+            async with self.pool.acquire() as conn:
+                if await self.check_or_migrate(conn):
+                    return self.pool
+
+    @staticmethod
+    async def _init(conn) -> None:
+        """
+        Add JSON encoding and decoding to the given connection.
+        """
+        await conn.set_type_codec(
+            'jsonb',
+            encoder=json_dumps,
+            decoder=json_loads,
+            schema='pg_catalog',
+        )
+
+    async def __aexit__(self, exc_type, exc, tb) -> None:
+        """
+        Close the connection pool.
+        """
+        await self.shutdown()
+
+    async def shutdown(self):
+        """
+        Close the pool.
+        """
+        await self.pool.close()
+
+    async def check_or_migrate(self, conn: asyncpg.Connection) -> bool:
+        """
+        Check database connectivity and run missing schema migrations.
+
+        Return whether the database is usable.
+        """
+        row = await conn.fetchrow('SELECT 1+1 AS result')
+        if not row or row.get('result') != 2:
+            msg = 'Database SELECT 1+1 not working; missing privileges?'
+            print(msg, file=self.out)
+            logger.critical(msg)
+            return False
+
+        # determine current schema_version
+        try:
+            sql = "SELECT value::int FROM kvs WHERE key='schema_version'"
+            schema_version = await conn.fetchval(sql)
+        except Exception:  # e.g. table kvs does not exist yet
+            schema_version = 0
+
+        # run missing migrations
+        migrations = get_migrations()
+        for number, text in sorted(migrations.items()):
+            if number > schema_version:
+                cmds = text.split('\n----\n')
+                for cmd in cmds:
+                    if not cmd.strip():
+                        continue
+                    try:
+                        await conn.execute(cmd)
+                    except Exception:
+                        msg = (
+                            f'Exception during migration {number} in '
+                            f'statement\n{cmd}'
+                        )
+                        print(msg, file=self.out)
+                        logger.critical(msg)
+                        print(format_exc(), file=self.out)
+                        logger.critical(format_exc())
+                        return False
+
+        # return success
+        return True
+
+
+def get_migrations() -> Dict[int, str]:
+    """
+    Return migrations (number and text content of migration file).
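+
+    Migration files are expected in the `migrations` directory next to
+    this module, named `<number>.sql` (e.g. `1.sql`); the statements in
+    a file are separated by lines containing just `----`.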
+    """
+    migrations_dir = Path(__file__).parent / 'migrations'
+    migrations = {}
+    for migration_file in migrations_dir.glob('*.sql'):
+        migration_number = int(migration_file.name[:-4])
+        with migration_file.open() as mig_file:
+            content = mig_file.read()
+        migrations[migration_number] = content
+    return migrations
diff --git a/src/atextcrawler/migrations/1.sql b/src/atextcrawler/migrations/1.sql
new file mode 100644
index 0000000..0c4053f
--- /dev/null
+++ b/src/atextcrawler/migrations/1.sql
@@ -0,0 +1,297 @@
+CREATE TABLE kvs (
+    id bigserial PRIMARY KEY,
+    t_update timestamp NOT NULL DEFAULT (now() at time zone 'utc'),
+    key varchar(200) NOT NULL UNIQUE,
+    value jsonb
+)
+----
+COMMENT ON COLUMN kvs.t_update IS 'Time of last update or insert of the entry';
+----
+COMMENT ON COLUMN kvs.key IS 'Key';
+----
+COMMENT ON COLUMN kvs.value IS 'Value';
+----
+COMMENT ON TABLE kvs IS 'Simple key-value store';
+----
+INSERT INTO kvs (key, value) VALUES ('schema_version', '1');
+----
+CREATE TABLE site (
+    id bigserial PRIMARY KEY,
+    canonical_url varchar(200),
+    base_url varchar(200) NOT NULL,
+    base_urls varchar(200)[] NOT NULL,
+    domains varchar(100)[],
+    ips inet[] NULL,
+    crawl_enabled bool NOT NULL DEFAULT false,
+    crawl_active bool NOT NULL DEFAULT false,
+    next_full_crawl timestamp,
+    next_feed_crawl timestamp,
+    last_update timestamp,
+    last_pub timestamp,
+    pub_dates jsonb NOT NULL DEFAULT '{}'::jsonb,
+    langs char(2)[] NOT NULL DEFAULT ARRAY[]::varchar(2)[],
+    alt_langs jsonb NOT NULL DEFAULT '{}'::jsonb,
+    title varchar(200),
+    description varchar(2000),
+    keywords varchar(50)[] NOT NULL DEFAULT ARRAY[]::varchar(50)[],
+    linkbacks jsonb NOT NULL DEFAULT '{}'::jsonb,
+    meta_info jsonb NOT NULL DEFAULT '{}'::jsonb,
+    boilerplate_texts jsonb NOT NULL DEFAULT '[]'::jsonb
+)
+----
+CREATE INDEX site__base_url ON site (base_url)
+----
+CREATE INDEX site__base_urls ON site (base_urls)
+----
+CREATE INDEX site__domains ON site (domains)
+----
+CREATE INDEX site__ips ON site (ips)
+----
+CREATE INDEX site__next_full_crawl ON site (next_full_crawl)
+----
+CREATE INDEX site__next_feed_crawl ON site (next_feed_crawl)
+----
+CREATE INDEX site__langs ON site (langs)
+----
+CREATE INDEX site__title ON site (title)
+----
+CREATE INDEX site__description ON site (description)
+----
+CREATE INDEX site__keywords ON site (keywords)
+----
+COMMENT ON COLUMN site.base_url IS 'Preferred base URL (from column base_urls)'
+----
+COMMENT ON COLUMN site.base_urls IS 'Base URLs that have been found to return the same content'
+----
+COMMENT ON COLUMN site.domains IS 'Domains that have been found to return the same content'
+----
+COMMENT ON COLUMN site.ips IS 'IPv4 or IPv6 addresses of the hostnames in base_urls'
+----
+COMMENT ON COLUMN site.crawl_enabled IS 'Whether the site should be indexed'
+----
+COMMENT ON COLUMN site.crawl_active IS 'Whether a crawl of the site is in progress'
+----
+COMMENT ON COLUMN site.next_full_crawl IS 'Crawl all resources of this site again after this instant of time; do not crawl if null'
+----
+COMMENT ON COLUMN site.next_feed_crawl IS 'Crawl the feed resources of this site again after this instant of time; do not crawl if null'
+----
+COMMENT ON COLUMN site.last_update IS 'Time of last update of this site (in this database)'
+----
+COMMENT ON COLUMN site.last_pub IS 'Estimated time of last content publication on the site'
+----
+COMMENT ON COLUMN site.pub_dates IS 'Change history: map visit date to estimated publication date'
+----
+COMMENT ON COLUMN site.langs IS 'Languages of the site (ISO 639-1 codes)'
+----
+COMMENT ON COLUMN site.alt_langs IS 'Map links to alternative language versions of the site to ISO 639-1 language codes'
+----
+COMMENT ON COLUMN site.title IS 'Title as obtained from title tag or meta tags'
+----
+COMMENT ON COLUMN site.description IS 'Description as obtained from meta tags'
+----
+COMMENT ON COLUMN site.keywords IS 'Keywords as obtained from meta tags'
+----
+COMMENT ON COLUMN site.linkbacks IS 'Map URL to type of linkback (cf. https://en.wikipedia.org/wiki/Linkback)'
+----
+COMMENT ON COLUMN site.meta_info IS 'Values from meta tags and other meta information'
+----
+COMMENT ON COLUMN site.boilerplate_texts IS 'Boilerplate texts on the startpage and other sample pages'
+----
+COMMENT ON TABLE site IS 'Website'
+----
+CREATE TABLE site_queue (
+    id bigserial PRIMARY KEY,
+    src bigint NULL REFERENCES site(id) ON DELETE CASCADE,
+    url varchar(200) NOT NULL,
+    link_text varchar(100),
+    t_create timestamp NOT NULL DEFAULT (now() at time zone 'utc')
+)
+----
+CREATE INDEX site_queue__url ON site_queue (url)
+----
+COMMENT ON COLUMN site_queue.src IS 'The id of the linking site; null in case of seeds or manual additions'
+----
+COMMENT ON COLUMN site_queue.url IS 'Base URL of site to be assessed, ending with a slash or a mandatory base path'
+----
+COMMENT ON COLUMN site_queue.link_text IS 'Text under the anchor tag on the source site'
+----
+COMMENT ON COLUMN site_queue.t_create IS 'Creation time of this entry'
+----
+COMMENT ON TABLE site_queue IS 'Queued site URLs'
+----
+CREATE TABLE site_feed (
+    id bigserial PRIMARY KEY,
+    site_id bigint NOT NULL REFERENCES site(id) ON DELETE CASCADE,
+    url varchar(200) NOT NULL,
+    etag text,
+    modified varchar(50),
+    t_visit timestamp,
+    t_content timestamp,
+    version varchar(10),
+    title varchar(200),
+    description text,
+    fail_count smallint NOT NULL DEFAULT 0
+)
+----
+CREATE INDEX site_feed__site ON site_feed (site_id)
+----
+CREATE INDEX site_feed__t_content ON site_feed (t_content)
+----
+COMMENT ON COLUMN site_feed.site_id IS 'Id of the site on which this feed was found'
+----
+COMMENT ON COLUMN site_feed.url IS 'URL of the feed'
+----
+COMMENT ON COLUMN site_feed.etag IS 'Etag obtained when requesting the feed'
+----
+COMMENT ON COLUMN site_feed.modified IS 'Last-Modified HTTP header value obtained when requesting the feed'
+----
+COMMENT ON COLUMN site_feed.t_visit IS 'Time of last retrieval of the feed; null before first retrieval'
+----
+COMMENT ON COLUMN site_feed.t_content IS 'Time of last content update; null before first retrieval'
+----
+COMMENT ON COLUMN site_feed.version IS 'Version of the feed; null before first retrieval'
+----
+COMMENT ON COLUMN site_feed.title IS 'Title of the feed; null before first retrieval'
+----
+COMMENT ON COLUMN site_feed.description IS 'Description of the feed; null before first retrieval'
+----
+COMMENT ON COLUMN site_feed.fail_count IS 'Number of failed retrievals after last successful retrieval; zero before first retrieval'
+----
+CREATE TABLE site_link (
+    id bigserial PRIMARY KEY,
+    src bigint NOT NULL REFERENCES site(id) ON DELETE CASCADE,
+    dst bigint NOT NULL REFERENCES site(id) ON DELETE CASCADE,
+    t_create timestamp NOT NULL DEFAULT (now() at time zone 'utc'),
+    link_text varchar(100)
+)
+----
+ALTER TABLE site_link ADD CONSTRAINT site_link_edge UNIQUE (src, dst)
+----
+CREATE INDEX site_link__src ON site_link (src)
+----
+CREATE INDEX site_link__dst ON site_link (dst)
+----
+COMMENT ON COLUMN site_link.src IS 'Source site'
+----
+COMMENT ON COLUMN site_link.dst IS 'Destination site'
+----
+COMMENT ON COLUMN site_link.t_create IS 'Time of creation of this entry'
+----
+COMMENT ON COLUMN site_link.link_text IS 'Text under the anchor tag on the source site'
+----
+COMMENT ON TABLE site_link IS 'Cross-site link'
+----
+CREATE TABLE resource (
+    id bigserial PRIMARY KEY,
+    simhash bigint,
+    content_type varchar(50),
+    last_change timestamp,
+    text_len int,
+    lang char(2),
+    title varchar(200),
+    summary varchar(2000)
+)
+----
+COMMENT ON COLUMN resource.simhash IS 'Simhash of the text content of the resource'
+----
+COMMENT ON COLUMN resource.content_type IS 'Content type extracted from Content-Type HTTP header'
+----
+COMMENT ON COLUMN resource.last_change IS 'Estimated time of the last update of this resource'
+----
+COMMENT ON COLUMN resource.text_len IS 'Length of the extracted text in characters'
+----
+COMMENT ON COLUMN resource.lang IS 'Language ISO 639-1 code'
+----
+COMMENT ON COLUMN resource.title IS 'Title of the resource (used for feed resources)'
+----
+COMMENT ON COLUMN resource.summary IS 'Content summary of the resource (used for feed resources)'
+----
+COMMENT ON TABLE resource IS 'Text resource (may be reachable by more than one path of a site)'
+----
+CREATE TABLE site_path (
+    id bigserial PRIMARY KEY,
+    site_id bigint NOT NULL REFERENCES site(id) ON DELETE CASCADE,
+    path varchar(400) NOT NULL,
+    last_visit timestamp,
+    filtered bool NOT NULL DEFAULT false,
+    ok_count smallint NOT NULL DEFAULT 0,
+    canonical bool,
+    resource_id bigint REFERENCES resource(id) ON DELETE CASCADE
+)
+----
+ALTER TABLE site_path ADD CONSTRAINT site_path__unique UNIQUE (site_id, path)
+----
+CREATE INDEX site_path__site_path ON site_path (site_id, path)
+----
+CREATE INDEX site_path__resource ON site_path (resource_id)
+----
+COMMENT ON COLUMN site_path.site_id IS 'Site id'
+----
+COMMENT ON COLUMN site_path.path IS 'Path'
+----
+COMMENT ON COLUMN site_path.last_visit IS 'Time of last retrieval of the resource; null before first retrieval'
+----
+COMMENT ON COLUMN site_path.ok_count IS 'Increased by 1 for every successful retrieval of the resource and decreased by 1 for every failed one'
+----
+COMMENT ON COLUMN site_path.canonical IS 'Whether the path is the canonical one for the resource; null before first retrieval'
+----
+COMMENT ON COLUMN site_path.resource_id IS 'Resource id; null before first retrieval'
+----
+COMMENT ON TABLE site_path IS 'Paths of a site pointing to text resources'
+----
+CREATE TABLE crawl (
+    id bigserial PRIMARY KEY,
+    site_id bigint NOT NULL REFERENCES site(id) ON DELETE CASCADE,
+    is_full bool NOT NULL DEFAULT false,
+    t_begin timestamp,
+    t_end timestamp,
+    n_resources int NOT NULL DEFAULT 0,
+    n_resources_new int NOT NULL DEFAULT 0
+)
+----
+CREATE INDEX crawl__site ON crawl (site_id)
+----
+CREATE INDEX crawl__t_begin ON crawl (t_begin)
+----
+COMMENT ON COLUMN crawl.site_id IS 'Site that is being crawled'
+----
+COMMENT ON COLUMN crawl.is_full IS 'Whether the crawl is a full crawl; if not it is a feed crawl'
+----
+COMMENT ON COLUMN crawl.t_begin IS 'Begin time of the crawl'
+----
+COMMENT ON COLUMN crawl.t_end IS 'End time of the crawl; if t_end is null resuming a crawl will fetch all resources with last_visit before t_begin'
+----
+COMMENT ON COLUMN crawl.n_resources IS 'Number of resources that were fetched during the crawl'
+----
+COMMENT ON COLUMN crawl.n_resources_new IS 'Number of new resources found during the crawl'
+----
+COMMENT ON TABLE crawl IS 'Crawl of resources on a site'
+----
+CREATE TYPE
site_annotation_type AS ENUM ('whitelist', 'blacklist', 'suggestion', 'review', 'audience', 'location', 'themes', 'timescale') +---- +COMMENT ON TYPE site_annotation_type IS 'Type of site annotation' +---- +CREATE TABLE site_annotation ( + id bigserial PRIMARY KEY, + site_id bigint REFERENCES site(id) ON DELETE SET NULL, + base_url varchar(200) NOT NULL, + ann_type site_annotation_type NOT NULL, + ann_content JSONB, + t_update timestamp NOT NULL DEFAULT (now() at time zone 'utc') +) +---- +CREATE INDEX site_annotation__site ON site_annotation (site_id) +---- +CREATE INDEX site_annotation__base_url ON site_annotation (base_url) +---- +COMMENT ON COLUMN site_annotation.site_id IS 'Site that is being annotated' +---- +COMMENT ON COLUMN site_annotation.base_url IS 'Base URL of the site being annotated' +---- +COMMENT ON COLUMN site_annotation.ann_type IS 'Annotation type' +---- +COMMENT ON COLUMN site_annotation.ann_content IS 'Annotation content' +---- +COMMENT ON COLUMN site_annotation.t_update IS 'Time of last update' +---- +COMMENT ON TABLE site_annotation IS 'Manual annotations on a site' diff --git a/src/atextcrawler/models.py b/src/atextcrawler/models.py new file mode 100644 index 0000000..934b791 --- /dev/null +++ b/src/atextcrawler/models.py @@ -0,0 +1,610 @@ +""" +Data Models. +""" + +import logging +from dataclasses import InitVar, asdict, dataclass, field, fields +from datetime import date, datetime +from itertools import chain +from typing import Any, ClassVar, Optional + +import tldextract +from asyncpg import Connection + +from .search import delete_resource +from .utils.durl import Durl, get_url_variants +from .utils.link import extract_domain +from .utils.similarity import get_simhash, simhash_to_bigint + +logger = logging.getLogger(__name__) + + +class ModelBase: + """ + Abstract base class for models. + + Execute SQL to load, save, delete instances using asyncpg. + """ + + table: ClassVar + id_: Optional[int] = 0 + + async def load(self, conn: Connection, id_: int) -> Optional[Any]: + """ + If loading fails, return None. + """ + sql = f"SELECT * FROM {self.table} WHERE id=$1" + row = await conn.fetchrow(sql, id_) + if not row: + return None + return await self.load_from_row(row) + + async def load_from_row(self, row): + """ + If row is None, return None. + """ + if not row: + return None + data = dict(row) + self.id_ = data.pop('id') + self.__init__(**data) + return self + + async def save(self, conn: Connection) -> None: + """ + Save the instance (update if self.id_ is set, else insert). + """ + data = asdict(self) + # logger.debug(f'Save {self}: id_={self.id_}') + if self.id_: # update + cols = ', '.join(data.keys()) + upds = ', '.join( + [f'{col}=${i + 1}' for i, col in enumerate(data.keys())] + ) + val_id = f'${len(data) + 1}' + sql = f"UPDATE {self.table} SET {upds} WHERE id={val_id}" + await conn.execute(sql, *data.values(), self.id_) + else: # insert + cols = ', '.join(data.keys()) + vals = ', '.join([f'${i + 1}' for i in range(len(data))]) + sql = ( + f"INSERT INTO {self.table} ({cols}) VALUES ({vals})" + f" RETURNING id" + ) + self.id_ = await conn.fetchval(sql, *data.values()) + + def asdict(self): + """ + Return instance data as dictionary. + """ + return asdict(self) + + async def delete(self, conn: Connection) -> None: + """ + Delete the object if it has an id_. + """ + if self.id_: + sql = f"DELETE FROM {self.table} WHERE id=$1" + await conn.execute(sql, self.id_) + + +class ResourceError: + """ + Error encountered while trying to fetch a resource. 
+    """
+
+    def __init__(self, msg, status=None, headers=None):
+        self.msg = msg
+        self.status = status
+        self.headers = headers
+
+    def __repr__(self):
+        return f'ResourceError: {self.msg}'
+
+
+class ResourceRedirect:
+    """
+    A resource containing a redirect.
+    """
+
+    def __init__(self, urls):
+        self.urls = urls
+
+
+@dataclass
+class TextResource(ModelBase):
+    """
+    TextResource (without path).
+
+    TextResource models web resources with relevant text content.
+    They are instantiated in modules page, document, ...; their metadata
+    are stored in table `resource` and the text content is stored with the
+    search engine.
+
+    Do not confuse with SitePath: Several SitePath instances
+    may point to a TextResource. The TextResource holds the actual content.
+
+    If we are not dealing with the startpage of a new site,
+    the init_fields dict usually will contain the site to which
+    the resource belongs.
+    """
+
+    table: ClassVar = 'resource'
+    init_fields: InitVar[dict] = None  # additional fields after fetching
+    search_fields: InitVar[dict] = None  # additional fields for indexing
+
+    # database fields
+    simhash: Optional[int] = None
+    content_type: Optional[str] = None
+    last_change: Optional[datetime] = None
+    text_len: int = 0
+    lang: Optional[str] = None
+    title: Optional[str] = None
+    summary: Optional[str] = None
+
+    def __post_init__(self, init_fields, search_fields):
+        if init_fields is None:
+            init_fields = {}
+        self.init_fields = init_fields
+        if search_fields is None:
+            search_fields = {}
+        self.search_fields = search_fields
+        self.site = self.init_fields.get('site')
+        self.site_id = self.site.id_ if self.site else None
+        self._update_simhash()
+
+    def __str__(self):
+        return (
+            f'TextResource(id={self.id_},'
+            f' site_id={self.site_id},'
+            f' type={self.content_type})'
+        )
+
+    def _update_simhash(self):
+        """
+        Update the simhash of the resource from its text content.
+        """
+        if self.simhash is None:
+            text = self.search_fields.get('text', '')
+            self.simhash = simhash_to_bigint(get_simhash(text))
+
+    async def save(self, conn: Connection):
+        """
+        Save the instance, extending the parent's method.
+        """
+        self.content_type = (
+            self.content_type[:50] if self.content_type else None
+        )
+        self.title = self.title[:200] if self.title else None
+        self.summary = self.summary[:400] if self.summary else None
+        self._update_simhash()
+        if self.last_change is None:
+            self.last_change = datetime.utcnow()
+        await super().save(conn)
+
+    async def update_from_resource(self, upd: 'TextResource'):
+        """
+        Update self with values from another resource.
+        """
+        names = [field.name for field in fields(self)]
+        for name in names:
+            cur_val = getattr(self, name)
+            upd_val = getattr(upd, name)
+            if not cur_val and upd_val is not None:
+                setattr(self, name, upd_val)
+        self.init_fields = upd.init_fields
+        self.search_fields = upd.search_fields
+
+
+@dataclass
+class MetaResource(ModelBase):
+    """
+    Parent class for Feed, Sitemap and SitemapIndex.
+
+    Instances of Sitemap and SitemapIndex are not stored. Note: class
+    Feed contains feed meta data and is stored in the database.
+    """
+
+
+@dataclass
+class SitemapIndex(MetaResource):
+    """
+    A SitemapIndex meta resource.
+
+    Just a list of the sitemap URLs, nothing more.
+    """
+
+    sitemaps: list = field(default_factory=list)
+
+
+@dataclass
+class Sitemap(MetaResource):
+    """
+    A Sitemap meta resource.
+
+    Just a list of the resulting links, nothing more.
+    """
+
+    urls: list = field(default_factory=list)
+
+
+@dataclass
+class Feed(MetaResource):
+    """
+    A site's feed (RSS, Atom, ...).
+    """
+
+    table: ClassVar = 'site_feed'
+    entries: InitVar[list] = None
+    site_id: Optional[int] = None
+    url: Optional[str] = None
+    etag: Optional[str] = None
+    modified: Optional[str] = None
+    t_visit: Optional[datetime] = None
+    t_content: Optional[datetime] = None
+    version: Optional[str] = None
+    title: Optional[str] = None
+    description: Optional[str] = None
+    fail_count: int = 0
+
+    def __post_init__(self, entries):
+        self.entries = entries
+
+    def __str__(self):
+        return f'Feed(id={self.id_}, site_id={self.site_id}, url={self.url})'
+
+    async def save(self, conn: Connection):
+        """
+        Save, trying to merge with existing entry matching on site_id and url.
+        """
+        if not self.site_id or not self.url:
+            msg = 'Saving feed failed: missing site_id or url'
+            logger.error(msg)
+            return
+        sql = "SELECT id FROM site_feed WHERE site_id=$1 AND url=$2"
+        self.id_ = await conn.fetchval(sql, self.site_id, self.url)
+        await super().save(conn)
+
+    def debug(self) -> str:
+        """
+        Return the instance data as a string for debug print output.
+        """
+        return (
+            f'Feed:\n'
+            f'- id: {self.id_}\n'
+            f'- site_id: {self.site_id}\n'
+            f'- url: {self.url}\n'
+            f'- etag: {self.etag}\n'
+            f'- modified: {self.modified}\n'
+            f'- t_visit: {self.t_visit}\n'
+            f'- t_content: {self.t_content}\n'
+            f'- version: {self.version}\n'
+            f'- title: {self.title}\n'
+            f'- description: {self.description}\n'
+            f'- fail_count: {self.fail_count}\n'
+            f'- entries: {self.entries}'
+        )
+
+
+@dataclass
+class Site(ModelBase):
+    """
+    Website.
+ """ + + table: ClassVar = 'site' + base_durl: InitVar[Durl] = None + feeds: InitVar[dict] = None + links_ext: InitVar[dict] = None + links_int: InitVar[dict] = None + startpage_text: InitVar[str] = None + + canonical_url: Optional[str] = None + base_url: Optional[str] = None + base_urls: list[str] = field(default_factory=list) + domains: list[str] = field(default_factory=list) + ips: Optional[list[str]] = None + crawl_enabled: bool = False + crawl_active: bool = False + next_full_crawl: Optional[datetime] = None + next_feed_crawl: Optional[datetime] = None + last_update: Optional[datetime] = None + last_pub: Optional[datetime] = None + pub_dates: Optional[dict[str, str]] = None + langs: list[str] = field(default_factory=list) + alt_langs: dict[str, str] = field(default_factory=dict) + title: Optional[str] = None + description: Optional[str] = None + keywords: list[str] = field(default_factory=list) + linkbacks: dict[str, str] = field(default_factory=dict) + meta_info: dict = field(default_factory=dict) + boilerplate_texts: list[str] = field(default_factory=list) + + def __post_init__( + self, + base_durl: Durl, + feeds=None, + links_ext=None, + links_int=None, + startpage_text=None, + ): + self.feeds = feeds + self.links_ext = links_ext + self.links_int = links_int + self.startpage_text = startpage_text + self.keywords = self.keywords[:20] + if not self.last_update: + self.last_update = datetime.utcnow() + pub_date: Optional[str] + if self.last_pub: + pub_date = date.isoformat(self.last_pub.date()) + self.pub_dates = {date.isoformat(self.last_update): pub_date} + else: + pub_date = None + self.pub_dates = {} + if base_durl: + self.base_urls = [base_durl.url()[:200]] + self.domains = [extract_domain(base_durl.hostname)[:100]] + + def __str__(self): + return ( + f'Site(id={self.id_}, url={self.base_url},' + f' crawl_enabled={self.crawl_enabled})' + ) + + async def update_base_url(self) -> None: + """ + Update the base_url, choosing the most relevant URL. + + If canonical_url is not None, use this. + Otherwise set self.base_url to the shortest from self.base_urls, + but requiring a https-url if there is at least one. + """ + if self.canonical_url and self.canonical_url not in self.base_urls: + if canonical_durl := await Durl(self.canonical_url): + self.base_urls.append(self.canonical_url) + domain = extract_domain(canonical_durl.hostname) + if domain not in self.domains: + self.domains.append(domain) + if self.canonical_url: + self.base_url = self.canonical_url + return + if not self.base_url: + url_candidates = self.base_urls + if https_urls := [ + url for url in self.base_urls if url.startswith('https://') + ]: + url_candidates = https_urls + self.base_url = min(url_candidates, key=len) + + async def save( # type: ignore + self, conn, merge=True + ) -> tuple[Optional[int], bool]: + """ + Store the site, optionally trying to merge it with an existing site. + + Return the id of the saved instance and whether a new instance + was created. + + If self.id_ is not 0, replace the data of the existing site with + this id. Else if not merge, store as new row, and if merge, + try to merge with an existing matching site. 
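+
+        Merging unions the base_urls and domains of both sites, keeps
+        the existing site's id_ and fills in a missing canonical_url.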
+        """
+        await self.update_base_url()
+        if not merge:
+            created = not bool(self.id_)
+            await super().save(conn)
+            return self.id_, created
+        if self.id_:
+            sql = "SELECT base_urls, pub_dates FROM site WHERE id=$1"
+            row = await conn.fetchrow(sql, self.id_)
+            self.base_urls = list(
+                set(row['base_urls']).union(set(self.base_urls))
+            )
+            if previous_pub_dates := row['pub_dates']:
+                if not self.pub_dates:
+                    self.pub_dates = {}
+                self.pub_dates.update(previous_pub_dates)
+            await super().save(conn)
+            return self.id_, False
+        same_site = None
+        if same_site_id := await search_same_site(self, conn):
+            same_site = await Site().load(conn, same_site_id)
+        if same_site:
+            same_site.base_urls = list(
+                set(same_site.base_urls).union(set(self.base_urls))
+            )
+            same_site.domains = list(
+                set(same_site.domains).union(set(self.domains))
+            )
+            if self.canonical_url and not same_site.canonical_url:
+                same_site.canonical_url = self.canonical_url
+            await same_site.save(conn, merge=False)  # save without merging
+            self.id_ = same_site.id_
+            return self.id_, False
+        else:
+            await super().save(conn)
+            return self.id_, True
+
+
+@dataclass
+class SitePath(ModelBase):
+    """
+    Path of a website. May point to a Resource.
+    """
+
+    table: ClassVar = 'site_path'
+    site: InitVar[str] = None
+
+    site_id: Optional[int] = None
+    path: Optional[str] = None
+    filtered: bool = False
+    last_visit: Optional[datetime] = None
+    ok_count: int = 0
+    canonical: Optional[bool] = None
+    resource_id: Optional[int] = None
+
+    def __str__(self):
+        return (
+            f'SitePath(id={self.id_}, site_id={self.site_id},'
+            f' path={self.path})'
+        )
+
+    async def save(self, conn: Connection):
+        """
+        Save the instance, extending the parent's method.
+        """
+        self.path = self.path[:400] if self.path else ''
+        await super().save(conn)
+
+    async def unlink_resource(self, conn, engine, index_base_name):
+        """
+        Unlink the resource and also delete it, if it has no more links.
+        """
+        if self.id_:
+            if self.resource_id:
+                sql = "SELECT COUNT(*) FROM site_path WHERE resource_id=$1"
+                ref_count = await conn.fetchval(sql, self.resource_id)
+                if ref_count == 0:
+                    sql = (
+                        "DELETE FROM resource WHERE id=$1"
+                        " RETURNING (true, lang)"
+                    )
+                    found = await conn.fetchval(sql, self.resource_id)
+                    if found:
+                        await delete_resource(
+                            engine, found[1], self.resource_id
+                        )
+            self.resource_id = None
+
+    def url(self, site):
+        """
+        Return the full URL (combine the site's base_url with our path).
+        """
+        return site.base_url + self.path
+
+
+@dataclass
+class Crawl(ModelBase):
+    """
+    The crawl process of a website (begin, end, statistics, ...).
+    """
+
+    table: ClassVar = 'crawl'
+    site_id: Optional[int] = None
+    is_full: bool = False
+    t_begin: datetime = field(default_factory=datetime.utcnow)
+    t_end: Optional[datetime] = None
+    n_resources: int = 0
+    n_resources_new: int = 0
+
+    async def finish(self, conn, set_t_end):
+        """
+        Save the crawl. Set t_end only if indicated.
+        """
+        if set_t_end:
+            self.t_end = datetime.utcnow()
+        await self.save(conn)
+
+
+async def search_same_site(
+    site: Site,
+    conn: Connection,
+) -> Optional[int]:
+    """
+    Try to find a matching site for the given *site* and return its id.
+
+    TODO: if the path is non-trivial, require it also for the matching site
+
+    Two sites match when they return the same content for identical paths.
+    The base_url (scheme and/or netloc) may differ.
+    We do not have the content for all paths of both websites, so we need
+    to estimate: We only take into account meta information from the
+    start pages of both sites, in particular the title, description
+    and information obtained from the base_urls:
+
+    We use a combination of these conditions:
+
+    1. one of the sites has a canonical URL which matches the
+       URL of the other site
+    2. the content fields (title, description) have sufficient information
+    3. the content fields match exactly
+    4. the domain matches
+    5. the domain matches, except for the TLD
+    6. the base_urls differ in their schemes (http vs. https)
+    7. the hostnames in the base_urls are identical
+    8. the hostnames in the base_urls differ by a prepended 'www.'
+    9. the IPs have at least one common address
+
+    The algorithm is this (first answer is final, yes means match):
+
+    * if (1) : yes
+    * if (2), (3), (4) : yes
+    * if (2), (3), (5), (9) : yes
+    * if (6), ((7) or (8)) : yes
+    * no
+    """
+    # rule (1)
+    if site.canonical_url:
+        sql = "SELECT id FROM site WHERE $1=ANY(base_urls) LIMIT 1"
+        id_ = await conn.fetchval(sql, site.canonical_url)
+        if id_:
+            return id_
+    else:
+        sql = "SELECT id FROM site WHERE canonical_url=ANY($1) LIMIT 1"
+        id_ = await conn.fetchval(sql, site.base_urls)
+        if id_:
+            return id_
+
+    # rule (6), ((7) or (8))
+    url_variants = list(
+        set(
+            chain.from_iterable(
+                get_url_variants(base_url) for base_url in site.base_urls
+            )
+        )
+    )
+    sql = "SELECT id FROM site WHERE base_urls && $1 LIMIT 1"
+    if id_ := await conn.fetchval(sql, url_variants):
+        return id_
+
+    # condition (2)
+    if len(site.title or '') > 15 or len(site.description or '') > 15:
+        sql = (
+            "SELECT * FROM site WHERE"
+            " COALESCE(title, '')=$1 AND COALESCE(description, '')=$2"
+        )
+        rows = await conn.fetch(sql, site.title or '', site.description or '')
+        # condition (3)
+        if rows:
+            # condition (4)
+            for row in rows:
+                domains = set(row.get('domains', []))
+                if domains & set(site.domains):
+                    return row['id']
+            # condition (9)
+            for row in rows:
+                ips = set(row.get('ips', []))
+                if site.ips and ips & set(site.ips):
+                    # condition (5)
+                    domains_ = row.get('domains', [])
+                    d1 = set([tldextract.extract(d).domain for d in domains_])
+                    domains_ = site.domains or []
+                    d2 = set([tldextract.extract(d).domain for d in domains_])
+                    if d1 & d2:
+                        return row['id']
+
+    return None
diff --git a/src/atextcrawler/plugin_defaults/__init__.py b/src/atextcrawler/plugin_defaults/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/atextcrawler/plugin_defaults/filter_resource_path.py b/src/atextcrawler/plugin_defaults/filter_resource_path.py
new file mode 100644
index 0000000..cb84e22
--- /dev/null
+++ b/src/atextcrawler/plugin_defaults/filter_resource_path.py
@@ -0,0 +1,22 @@
+"""
+Filter paths found in a resource.
+
+This plugin implements :func:`rp_filter`.
+"""
+
+from typing import Optional
+
+
+def rp_filter(site, durl) -> Optional[str]:
+    """
+    Adjust or filter found paths (may depend on site).
+
+    To filter out a path (i.e., not add it to table `site_path`)
+    return None.
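+
+    For example, paths ending in `.jpg` or `.png` are filtered out,
+    and a trailing `?amp=1` is stripped, so `/article?amp=1` becomes
+    `/article`.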
+ """ + path = durl.pwa() + # skip fetching images (linked from a tags; img tags are skipped anyway) + if path.lower().endswith('.jpg') or path.lower().endswith('.png'): + return None + path = path.removesuffix('?amp=1') + return path diff --git a/src/atextcrawler/plugin_defaults/filter_site.py b/src/atextcrawler/plugin_defaults/filter_site.py new file mode 100644 index 0000000..09b2282 --- /dev/null +++ b/src/atextcrawler/plugin_defaults/filter_site.py @@ -0,0 +1,47 @@ +""" +Relevance estimation of sites. + +This plugin implements :func:`site_filter`. +""" + +import re + +from atextcrawler.models import Site + +MIN_RELEVANCE_SCORE = 5 + + +async def site_filter(site: Site) -> bool: + """ + Assess relevance of the site (using language-dependent criteria). + + If the site shall be crawled, return True, else False. + """ + # limit to sites in English or German language + if not set(['de', 'en']) & set(site.langs): + return False + score = 0.0 + for crit_name, weight, langs, crit_re in re_criteria: + if '*' in langs or set(langs) & set(site.langs): + findings = crit_re.findall(site.startpage_text) + if findings: + score += weight * len(findings) + if site.title and crit_re.search(site.title): + score += 4 * weight + if site.description and crit_re.search(site.description): + score += 4 * weight + + # TODO: add criteria for named entities (FdA-IFA, FAU, ...) + + return score >= MIN_RELEVANCE_SCORE + + +re_criteria = { + ( + 'anarch', + 1.0, + ('*',), + re.compile('((? bool: + """ + Per-site path filter. Return whether the path shall be retrieved. + """ + if not robots.can_fetch_url(site.base_url + path): + return False + if 'amusewiki' in site.meta_info.get('generator', '').lower(): + if any( + [ + path.endswith(end) + for end in ('.html', '.epub', '.tex', '.zip', '.pdf') + ] + ): + return False + if '/bbselect?' in path: + return False + return True diff --git a/src/atextcrawler/resource/__init__.py b/src/atextcrawler/resource/__init__.py new file mode 100644 index 0000000..f6aee1d --- /dev/null +++ b/src/atextcrawler/resource/__init__.py @@ -0,0 +1,10 @@ +from .dedup import store_boilerplate_texts +from .feed import feed_types, update_feed +from .fetch import ResourceFetcher +from .operations import ( + add_site_paths, + get_site_path, + process_site_path, + store_feed_entries, +) +from .sitemap import extract_sitemap_paths, get_sitemap_urls diff --git a/src/atextcrawler/resource/__main__.py b/src/atextcrawler/resource/__main__.py new file mode 100644 index 0000000..1542dfd --- /dev/null +++ b/src/atextcrawler/resource/__main__.py @@ -0,0 +1,96 @@ +""" +Dev tool for fetching and displaying a resource. + +Has no permanent effects. +""" + +import asyncio +import logging +import sys +from collections import defaultdict +from pprint import pformat + +import aiohttp + +from ..models import Feed, TextResource +from ..resource import ResourceFetcher +from ..utils.annotation import pack_annotations, unpack_annotations +from ..utils.durl import Durl + +logger = logging.getLogger() +logger.setLevel(logging.DEBUG) +logger.addHandler(logging.StreamHandler()) +logger_page_debug = logging.getLogger('atextcrawler.resource.page.debug') +logger_page_debug.setLevel(logging.DEBUG) + + +def add_tags(text, annotations): + """ + Reconstruct html from text and annotations. + + This is very similar to what the client does when displaying + a cached hit. 
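+
+    *annotations* is expected to contain 'tags' (mapping span tuples
+    (i, f) to lists of tag names) and 'links' (mapping each href to an
+    (i, f, rel) triple), as produced by the annotation utilities.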
+ """ + html = '' + opening_tags = defaultdict(list) + closing_tags = defaultdict(list) + anns_tags = sorted( + annotations['tags'].items(), key=lambda x: (x[0][0], -x[0][1]) + ) + for (i, f), anns in anns_tags: + opening_tags[i] += [tag for tag in reversed(anns)] + closing_tags[f] += [tag for tag in reversed(anns)] + positions = sorted(set(opening_tags.keys()) | set(closing_tags.keys())) + last_pos = 0 + links = {i: href for href, (i, f, rel) in annotations['links'].items()} + for pos in positions: + html += text[last_pos:pos] + closing = closing_tags.get(pos, []) + opening = opening_tags.get(pos, []) + common = set(closing) & set(opening) + closing = [tag for tag in closing if tag not in common] + opening = [tag for tag in opening if tag not in common] + tags_html = '' + for tag in reversed(closing): + html += f'\n' + for tag in opening: + if tag == 'a': + href = links.get(pos, '#') + html += f'' + else: + html += f'<{tag}>' + last_pos = pos + return html + + +async def run(): + """ + Fetch and display a resource with URL given as cmdline argument. + """ + url = sys.argv[1] + async with aiohttp.ClientSession() as session: + if not (durl := await Durl(url)): + return + fetcher = ResourceFetcher(session) + resource = await fetcher.fetch(url) + if isinstance(resource, TextResource): + logger.warning(repr(resource)) + logger.warning(f'Language: {resource.lang}') + logger.warning(pformat(resource.search_fields)) + logger.warning(pformat(resource.init_fields)) + + # annotations = resource.search_fields.get('annotations') + # text = resource.search_fields['text'] + # with open('/tmp/1.html', 'w') as f: + # html = add_tags(text, annotations) + # f.write(f'\nhhh' + # f'\n{html}\n') + elif isinstance(resource, Feed): + logger.warning(resource.debug()) + else: + logger.warning(f'Resource has type {type(resource)}') + logger.warning(resource) + + +if __name__ == '__main__': + asyncio.run(run()) diff --git a/src/atextcrawler/resource/dedup.py b/src/atextcrawler/resource/dedup.py new file mode 100644 index 0000000..54998dc --- /dev/null +++ b/src/atextcrawler/resource/dedup.py @@ -0,0 +1,59 @@ +""" +Find boilerplate texts. +""" + +from collections import Counter + +from ..models import TextResource +from ..utils.probe import extract_samples +from ..utils.section import iter_sections + + +async def store_boilerplate_texts(fetcher, conn, site): + """ + Find and store boilerplate texts of a site. + + Fetch the start page and internal sample links obtained from it. + If there are sufficienty frequently appearing text sections, + consider them as boilerplate texts. + + If boilerplate_texts were found, update the given site instance. 
+ """ + startpage = await fetcher.fetch(site.base_url, site=site) + if ( + not isinstance(startpage, TextResource) + or startpage.content_type != 'html' + ): + return + + # fetch sample resources + sample_links = extract_samples(startpage.init_fields['links_int']) + resources = [startpage] + for sample_link in sample_links: + if sample_link.path == site.base_url: # avoid duplicate resources + continue # NB: duplicate resources may have different paths + sample_resource = await fetcher.fetch(sample_link.url(), site=None) + if ( + isinstance(sample_resource, TextResource) + and sample_resource.content_type == 'html' + ): + resources.append(sample_resource) + + # find common texts in resources + if (n_resources := len(resources)) > 2: + text_freq = Counter() + for resource in resources: + text = resource.search_fields['text'] + semantic_breaks = resource.search_fields['annotations'][ + 'semantic_breaks' + ] + for sec in iter_sections(text, semantic_breaks): + text_freq[sec[3]] += 1 + boilerplate_texts = [] + if min(text_freq.values() or [0]) == 1: # no resource fetched twice + for text, freq in text_freq.items(): + if freq > 2: + boilerplate_texts.append(text) + sql = "UPDATE site SET boilerplate_texts=$1 WHERE id=$2" + await conn.execute(sql, boilerplate_texts, site.id_) + site.boilerplate_texts = boilerplate_texts diff --git a/src/atextcrawler/resource/document.py b/src/atextcrawler/resource/document.py new file mode 100644 index 0000000..4284465 --- /dev/null +++ b/src/atextcrawler/resource/document.py @@ -0,0 +1,131 @@ +""" +Parse documents (often application/pdf). +""" + +import logging +import re +from datetime import datetime +from typing import Optional, Union + +from tika import parser + +from ..models import ResourceError, ResourceRedirect, Site, TextResource +from ..utils.durl import Durl +from ..utils.http import get_header_links +from ..utils.lang import extract_content_language +from .plaintext import annotate_text + +logger = logging.getLogger(__name__) +logger_debug = logging.getLogger(__name__ + '.debug') +logger_debug.setLevel(logging.INFO) + + +re_url = re.compile( + r'((http|https)://[\w_-]+\.[\w_-]+(:[0-9]+)?' + r'([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?)' +) + + +async def parse_document( + durl: Durl, + resp: dict, + site: Optional[Site], +) -> Optional[Union[TextResource, ResourceError, ResourceRedirect]]: + """ + Extract plain text from documents in various formats. 
+ """ + content = resp['content'] + + # HTTP headers, canonical URL, shortlink + header_links = await get_header_links(resp['headers'], durl, site) + if canonical := header_links.get('canonical'): + if canonical != durl.url(): + return ResourceRedirect(resp['redirects'] + [canonical]) + shortlink = header_links.get('shortlink') + + # use tika to extract text + doc = parser.from_buffer(content) + # logger.debug(pformat(doc)) + if doc.get('status') != 200: + msg = f'Analyzing document failed: {durl.url()}' + return ResourceError(msg) + + # collect meta data + meta = doc.get('metadata', {}) + content_type = meta.get('Content-Type') + if isinstance(content_type, list): + content_type = content_type[-1] + title = concat(meta.get('title')) + concat(meta.get('creator')) + last_change = extract_latest(meta.get('date') or meta.get('created')) + keywords = None + + # text content + text = (doc.get('content') or '').strip() + + # links + links_int: dict[Durl, tuple[list[str], str]] = {} + links_ext: dict[Durl, tuple[list[str], str]] = {} + for url in re_url.findall(text): + link_durl = await Durl(url[0]) + if link_durl: + if link_durl.site() == durl.site(): + links_int[link_durl] = [], link_durl.url() + else: + links_ext[link_durl] = [], link_durl.url() + + # annotations + text, annotations = annotate_text(text) + + return TextResource( + content_type=content_type, + last_change=last_change, + text_len=len(text), + lang=extract_content_language(text), + title=title, + init_fields={ + 'durl': durl, + 'site': site, + 'headers': resp['headers'], + 'redirects': resp['redirects'], + 'links_int': links_int, + 'links_ext': links_ext, + 'shortlink': shortlink, + 'canonical': None, + }, + search_fields={ + 'title': title, + 'pub_date': last_change, + 'keywords': keywords, + 'text': text, + 'annotations': annotations, + }, + ) + + +def extract_latest(s: Optional[Union[str, list]]) -> Optional[datetime]: + """ + Extract the lastest date (if any) from a string or list of strings. + """ + if not s: + return None + if not isinstance(s, list): + s = [s] + dt = [] + for t in s: + try: + dt.append(datetime.fromisoformat(t.rstrip('Z'))) + except: + pass + return max(dt) if dt else None + + +def concat(s: Optional[Union[str, list]]) -> Optional[str]: + """ + Helper function for joining strings together. + """ + if not s: + return None + if not isinstance(s, list): + s = [s] + return ' '.join(s) diff --git a/src/atextcrawler/resource/feed.py b/src/atextcrawler/resource/feed.py new file mode 100644 index 0000000..c7713bd --- /dev/null +++ b/src/atextcrawler/resource/feed.py @@ -0,0 +1,155 @@ +""" +Stuff related to feeds. + +Higher-level stuff is in site.feeds. +""" + +import logging +from datetime import datetime, timezone +from typing import Optional, Union + +from asyncpg import Connection +from feedparser import parse + +from ..models import Feed, MetaResource, ResourceError +from ..utils.durl import Durl + +logger = logging.getLogger(__name__) + + +feed_types = ( + 'application/rss+xml', + 'application/atom+xml', + 'application/feed+json', +) + + +async def update_feed(fetcher, feed, conn) -> Optional[list[dict]]: + """ + Fetch, parse and return a given feed's content. Also update *feed*. + + If the server replied with HTTP 410, delete the feed. + If there is no new information (server replied with HTTP 304), + return None. For other errors also return None and increase the + fail_count. 
+ """ + headers = {'Cache-control': 'max-age=600'} + if feed.modified: + headers['If-Modified-Since'] = feed.modified + elif feed.etag: + headers['If-None-Match'] = feed.etag.removeprefix('W/') + resource = await fetcher.fetch(feed.url, headers=headers) + if isinstance(resource, ResourceError): + if resource.status == 410: + msg = f'Feed has vanished, deleting it: {feed}' + logger.debug(msg) + await feed.delete(conn) + if resource.status != 304: + feed.fail_count += 1 + if feed.fail_count > 5: + msg = f'Feed not reachable, deleting it: {feed}' + logger.debug(msg) + await feed.delete(conn) + return None # HTTP 304, no new entries + elif isinstance(resource, Feed): + resource.id_ = feed.id_ + resource.site_id = feed.site_id + await resource.save(conn) + return resource.entries + else: + return None + + +def parse_json_feed(resp, data: dict) -> Feed: + """ + Parse a JSON response for jsonfeed information. + + TODO: handle 'next_url' (see https://jsonfeed.org/version/1.1) + """ + feed = Feed() + feed.url = data.get('feed_url', resp['redirects'][-1]) + feed.etag = resp['headers'].get('ETag') + feed.modified = resp['headers'].get('Last-Modified') + feed.t_visit = datetime.utcnow() + version = data.get('version', '') + version = 'json-' + version.removeprefix('https://jsonfeed.org/version/') + feed.version = version[:10] + feed.title = data.get('title') + feed.description = data.get('description') + feed.fail_count = 0 + entries = [] + latest = None + # parse feed entries to a dict compatible with feedparser's entries + for feed_item in data.get('items', []): + entry = {} + entry['link'] = feed_item.get('url') + dt = feed_item.get('date_published') + if dt: + dt = datetime.fromisoformat(dt) if dt else None + dt = dt.astimezone(tz=None).replace(tzinfo=timezone.utc) + entry['published_parsed'] = dt.timetuple() + entry['title'] = feed_item.get('title') + entry['summary'] = feed_item.get('summary') + entries.append(entry) + if dt: + latest = max(latest or dt, dt) + feed.entries = entries + feed.t_content = latest + return feed + + +def parse_xml_feed(resp) -> Union[Feed, ResourceError]: + """ + Parse a response from Fetcher.get_resp() for xml feed information. + """ + feed = Feed() + feed.url = resp['redirects'][-1] + feed.etag = resp['headers'].get('ETag') + feed.modified = resp['headers'].get('Last-Modified') + feed.t_visit = datetime.utcnow() + try: + parsed = parse(resp['content'], response_headers=resp['headers']) + except Exception as error: + return ResourceError(f'Feedparser error: {error}') + latest = parsed['feed'].get('updated_parsed') + if latest: + latest = datetime(*latest[:6]) + feed.t_content = max(feed.t_content or latest, latest) + feed.version = parsed['version'] + feed.title = parsed['feed'].get('title', '')[:200] or None + feed.description = parsed['feed'].get('description') + feed.fail_count = 0 + feed.entries = parsed['entries'] + return feed + + +def convert_feed_entries( + base_url: Optional[str], + entries: list[dict], +) -> tuple[ + list[tuple[str, bool]], + dict[str, tuple[Optional[str], Optional[str], Optional[str]]], +]: + """ + Extract paths and resource meta information from a feed's entries. + + Return paths in a structure wanted by :func:`add_site_paths` and + resource meta information in a structure wanted by + :func:`update_resource_meta`. 
+ """ + paths = [] + resource_meta = {} + for entry in entries: + if entry.get('link') and entry['link'].startswith(base_url or ''): + path = entry['link'].removeprefix(base_url or '').lstrip('/') + if len(path) <= 200: + last_update = entry.get('published_parsed') + if last_update: + last_update = datetime(*last_update[:6]) + paths.append((path, True)) + resource_meta[path] = ( + last_update, + entry.get('title', '')[:200] or None, + entry.get('summary', '')[:2000] or None, + ) + return paths, resource_meta diff --git a/src/atextcrawler/resource/fetch.py b/src/atextcrawler/resource/fetch.py new file mode 100644 index 0000000..f1f0f8f --- /dev/null +++ b/src/atextcrawler/resource/fetch.py @@ -0,0 +1,327 @@ +""" +Access to a resource specified by a URL. +""" + +import gzip +import logging +from json import loads +from traceback import format_exc +from typing import Any, Optional, Union + +import aiohttp +from bs4 import BeautifulSoup + +from ..models import ( + Feed, + MetaResource, + ResourceError, + ResourceRedirect, + Site, + TextResource, +) +from ..utils.durl import Durl +from ..utils.link import in_blacklist +from .document import parse_document +from .feed import parse_json_feed, parse_xml_feed +from .page import parse_html +from .plaintext import parse_plaintext +from .sitemap import parse_sitemap, parse_sitemapindex + +logger = logging.getLogger(__name__) + + +MAX_REDIRECTS = 10 +""" +Maximum number of redirects to follow. +""" + + +default_headers = { + 'User-Agent': 'Mozilla/5.0 (X11; Linux aarch64; rv:78.0)' + ' Gecko/20100101 Firefox/78.0', + 'DNT': '1', + 'Upgrade-Insecure-Requests': '1', + 'Accept-Language': 'en-US,en;q=0.5, *;q=0.5', +} +""" +Default HTTP client headers, overwriting those of aiohttp.ClientSession. +""" + + +blacklist_content_types = [ + '', + 'application/ogg', +] +""" +Blacklist for content-types. +""" + + +text_content_types = { + 'text/html': 'html', + 'text/plain': 'plain', + 'application/rss+xml': 'feed-rss', + 'application/atom+xml': 'feed-atom', + 'application/feed+json': 'feed-json', + 'application/json': 'json', + 'application/xml': 'xml', + 'text/xml': 'xml', +} +""" +Map content-types to parsers. +""" + + +class ResourceFetcher: + """ + Fetch a resource specified by a URL (:meth:`fetch`). + + The timeout is the same for all requests. + """ + + def __init__( + self, + session: aiohttp.ClientSession, + timeout_sock_connect: Union[int, float] = 8, + timeout_sock_read: Union[int, float] = 30, + ): + self.session = session + self.timeout = aiohttp.ClientTimeout( + sock_connect=timeout_sock_connect, sock_read=timeout_sock_read + ) + + async def fetch( + self, + url: str, + site: Optional[Site] = None, + redirect_history: Optional[list[str]] = None, + headers: Optional[dict] = None, + ) -> Union[ + None, MetaResource, TextResource, ResourceError, ResourceRedirect + ]: + """ + Try to fetch a resource and return an instance or error or redirect. + + If an error was encountered, return a ResourceError. + If the resource has an irrelevant content type, return None. + Otherwise return a specific content instance. + + Argument *redirect_history* contains the redirect history; + if one of the redirects is encountered again, return None. 
+ """ + if redirect_history is None: + redirect_history = [] + if not (durl := await Durl(url)): + return ResourceError('Invalid URL') + resp = await self.get_resp( + durl, + redirect_history=redirect_history, + headers=headers, + ) + if isinstance(resp, ResourceError): + return resp + if resp is None: + return None + result = await self._parse(durl, site, resp) + if isinstance(result, (MetaResource, TextResource)): + result.id_ = None + return result + + async def _parse( + self, durl, site, resp, in_recursion=False + ) -> Union[ + None, MetaResource, TextResource, ResourceError, ResourceRedirect + ]: + """ + Parse a response. May call itself. + """ + result: Union[ + None, MetaResource, TextResource, ResourceError, ResourceRedirect + ] = None + content = resp['content'] + if isinstance(content, str) and content.startswith(' Optional[Union[ResourceError, dict]]: + """ + Try to fetch a url returning a ResourceError or a dict with content. + + Optional *headers* will overwrite the :var:`default_headers`. + + If the response status is not 200, always return an ResourceError. + + If the content-type is not relevant (see blacklist_content_types), + return None. + + The dict contains these keys+values: + + * 'parser': a hint on the parser to use for analyzing the content; + one of 'html', 'plain', 'feed', 'xml', 'application' + * 'content': bytes for type application, otherwise str + * 'redirects': a list of URLs visited during HTTP redirection, + the last item is the final URL + * 'headers': response headers + """ + if redirect_history is None: + redirect_history = [] + if len(redirect_history) >= MAX_REDIRECTS: + return None + headers_ = default_headers.copy() + if headers: + headers_.update(headers) + try: + async with self.session.get( + durl.url(), + headers=headers_, + timeout=self.timeout, + ) as resp: + redirects = [durl.url()] + if resp.history: + href = resp.history[-1].headers.get('location') + if not href or not (redurl := await Durl(href, base=durl)): + msg = 'Invalid URL after HTTP redirect' + return ResourceError(msg) + if in_blacklist(redurl.hostname): + src_url = ( + redirect_history[0] + if redirect_history + else durl.url() + ) + msg = ( + f'Dropping URL {src_url}, since' + f' redirected to a blacklisted site' + ) + logger.debug(msg) + return None + redirects = [str(r.url) for r in resp.history] + redirects.append(redurl.url()) + if join := set(redirect_history) & set(redirects): + msg = f'Cyclic redirect {join}' + return ResourceError(msg) + if resp.status != 200: + msg = f'HTTP status {resp.status}' + return ResourceError( + msg, status=resp.status, headers=headers + ) + c_type = resp.headers.get('content-type', '').split(';')[0] + if c_type in blacklist_content_types: + return None + result: dict[str, Any] = { + 'redirects': redirect_history + redirects, + 'headers': resp.headers, + } + if c_type in text_content_types.keys(): + try: # catch decoding issues + content = await resp.text() + except: + body = await resp.read() + encoding = resp.charset or 'utf-8' + encoding = encoding.replace('CP-1250', 'cp1250') + content = body.decode(encoding, errors='replace') + result['content'] = content + result['parser'] = text_content_types[c_type] + return result + elif c_type.startswith('application/'): + result['content'] = await resp.read() + result['parser'] = 'application' + return result + except aiohttp.ClientError as error: + # on certificate error try without tls + if 'SSLCertVerificationError' in str(error): + if durl.scheme == 'https': + url = durl.url() + 
durl.replace_scheme('http') + response = await self.get_resp( + durl=durl, + headers=headers, + redirect_history=redirect_history + [url], + ) + if not isinstance(response, ResourceError): + return response + msg = f'ClientError: {error}' + return ResourceError(msg) + except Exception as error: + msg = f'Unknown error: {error}:\n{format_exc()}' + logger.error(msg) + return ResourceError(msg) + return None + + +async def parse_xml( + durl: Durl, + response: dict, + rss=False, + atom=False, +) -> Optional[Union[MetaResource, ResourceError]]: + """ + Parse XML content. + + In particular, parse sitemapindex, sitemap, RSS feed, atom feed. + """ + try: + xml = response['content'] + soup = BeautifulSoup(xml, 'html.parser') + except: + return None + if rss or (rss := soup.find('rss')): + return parse_xml_feed(response) + elif atom or (atom := soup.find('atom')): + return parse_xml_feed(response) + elif sitemapindex := soup.find('sitemapindex'): + return parse_sitemapindex(sitemapindex) + elif urlset := soup.find('urlset'): + return parse_sitemap(urlset) + else: + return None + + +async def parse_json( + durl: Durl, + response: dict, +) -> Optional[Union[Feed, ResourceError]]: + """ + Parse the content of JSON feeds. + """ + try: + data = loads(response['content']) + except: + msg = f'Could not parse JSON from {durl.url()}' + logger.debug(msg) + return None + if not isinstance(data, dict): + return None + if data.get('version', '').startswith('https://jsonfeed.org/'): + return parse_json_feed(response, data) + return None diff --git a/src/atextcrawler/resource/operations.py b/src/atextcrawler/resource/operations.py new file mode 100644 index 0000000..dffe2bc --- /dev/null +++ b/src/atextcrawler/resource/operations.py @@ -0,0 +1,347 @@ +""" +Operations on resources. +""" + +import logging +from datetime import datetime +from typing import Optional, Sequence + +from asyncpg import Connection + +from ..models import ( + Feed, + MetaResource, + ResourceError, + Site, + Sitemap, + SitemapIndex, + SitePath, + TextResource, +) +from ..search import delete_resource, index_resource +from ..tensorflow import TensorFlow +from ..utils.durl import Durl +from ..utils.similarity import ( + create_simhash, + search_simhash, + simhash_from_bigint, + simhash_to_bigint, +) +from .feed import convert_feed_entries +from .fetch import ResourceFetcher +from .sitemap import extract_sitemap_paths + +logger = logging.getLogger(__name__) + + +async def add_site_paths( + conn: Connection, + site_id: int, + paths: Sequence[tuple[str, Optional[bool]]], +) -> None: + """ + Add site paths. if resource infos are given, also create resources. + + The paths must be given as relative paths and together with a boolean + telling whether the link is a canonical link. + """ + sql = ( + "INSERT INTO site_path (site_id, path, canonical)" + " VALUES ($1, $2, $3) ON CONFLICT (site_id, path) DO NOTHING" + ) + values = ( + (site_id, path, canonical) + for path, canonical in paths[:100000] + if len(path) <= 400 + ) + await conn.executemany(sql, values) + + +async def update_resource_meta( + conn: Connection, + site_id: int, + resource_meta: dict, +) -> None: + """ + Update meta information of existing resources using path to find them. 
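+
+    *resource_meta* maps paths to (last_change, title, summary)
+    triples as returned by :func:`convert_feed_entries`, e.g.
+    (hypothetical)::
+
+        {'blog/post-1': (datetime(2021, 11, 29, 9, 16, 31),
+                         'Post 1', None)}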
+ """ + sql = ( + "UPDATE resource SET last_change=coalesce($1, last_change)," + " title=coalesce($2, title), summary=coalesce($3, summary) FROM (" + " SELECT resource_id FROM site_path WHERE site_id=$4 AND path=$5" + ") sp WHERE resource.id=sp.resource_id" + ) + values = ((*meta, site_id, path) for path, meta in resource_meta.items()) + await conn.executemany(sql, values) + + +async def store_feed_entries( + conn: Connection, + site: Site, + entries: list[dict], +) -> None: + """ + Add missing resources of a site from given feed entries. + """ + if site.id_: + paths, resource_meta = convert_feed_entries(site.base_url, entries) + await add_site_paths(conn, site.id_, paths) + await update_resource_meta(conn, site.id_, resource_meta) + + +async def get_site_path( + conn: Connection, + site: Site, + before: datetime, + only_new=False, +) -> Optional[SitePath]: + """ + Return the next path of a given site that needs to be processed. + + If none needs to be processed, return None. + + Only return paths that have last been visited before *before* + or not been processed at all. Paths with a ok_count of -3 or lower + are dropped. + + If *only_new*, limit to paths that have not been processed at all, + irrespective of the value of *before*. + """ + if only_new: + sql = ( + "SELECT * FROM site_path" + " WHERE site_id=$1 AND last_visit is null LIMIT 1" + ) # implicitly canonical=null + row = await conn.fetchrow(sql, site.id_) + else: + sql = ( + "SELECT * FROM site_path" + " WHERE site_id=$1 AND canonical IS NOT false AND" + " (last_visit is null OR last_visit<$2) AND" + " ok_count > -3 LIMIT 1" + ) # canonical can be true or null + row = await conn.fetchrow(sql, site.id_, before) + if row: + return await SitePath().load_from_row(row) + return None + + +async def process_site_path( + app, + worker_number: int, + conn: Connection, + fetcher: ResourceFetcher, + tf: TensorFlow, + site: Site, + site_path: SitePath, +) -> bool: + """ + Fetch a path, deduplicate and if canonical, update and index the resource. + + Return whether a new resource was handled that should contribute be + statistics. 
+ """ + msg = ( + f'Worker {worker_number} processing site {site.id_}' + f' site_path {site_path.id_} {site.base_url}{site_path.path}' + ) + logger.debug(msg) + if not site.id_: # only to satisfy typing + return False + + # fetch url + site_path.last_visit = datetime.utcnow() + url = site_path.url(site) + resource = await fetcher.fetch(url, site=site) + + # handle failure (possibly deleting old information) + if not isinstance(resource, (TextResource, MetaResource)): + if not resource: # irrelevant content-type + site_path.ok_count = -10 + elif isinstance(resource, ResourceError): + site_path.ok_count -= 1 + if site_path.ok_count <= -3 and site_path.resource_id: + await site_path.unlink_resource( + conn, + app.search_engine, + app.config['elasticsearch']['index_base_name'], + ) + await site_path.save(conn) + if resource: # relevant content-type + msg = ( + f'Worker {worker_number} failed to process site_path' + f' {site_path.id_} (site {site.id_},' + f' {site.base_url}{site_path.path})' + ) + logger.info(msg) + return False + + # handle MetaResources + if isinstance(resource, MetaResource): + if isinstance(resource, Feed): + resource.site_id = site.id_ + await resource.save(conn) + if resource.entries: + await store_feed_entries(conn, site, resource.entries) + elif isinstance(resource, Sitemap): + paths, _ = extract_sitemap_paths(site.base_url, resource.urls) + await add_site_paths(conn, site.id_, paths) + elif isinstance(resource, SitemapIndex): + for sitemap_dict in resource.sitemaps: + url = sitemap_dict['loc'] + res_sitemap = await fetcher.fetch(url, site=site) + if isinstance(res_sitemap, Sitemap): + paths, _ = extract_sitemap_paths( + site.base_url, res_sitemap.urls + ) + await add_site_paths(conn, site.id_, paths) + return False + + # handle TextResource + relevant, is_new_resource = await _handle_text_resource( + app, conn, tf, site, site_path, resource, url + ) + if not relevant: + return False + site_path.resource_id = resource.id_ + site_path.canonical = resource.init_fields.get('canonical') + site_path.ok_count += 1 + await site_path.save(conn) + + if shortlink_url := resource.init_fields.get('shortlink'): + await _save_shortlink( + conn, site, url, resource, shortlink_url, site_path.last_visit + ) + + return is_new_resource + + +async def _handle_text_resource( + app, conn, tf, site, site_path, resource, url +) -> tuple[bool, bool]: + """ + Ingest a text resource. + + Return whether the resource is relevant and whether it is new. 
+ """ + # save the resource's internal links + paths = [] + if links_int := resource.init_fields['links_int']: + for durl, (rel, _) in links_int.items(): + rp_filter = app.plugins['filter_resource_path'].rp_filter + if path := rp_filter(site, durl): + canon = (rel and rel.lower() == 'canonical') or None + paths.append((path, canon)) + await add_site_paths(conn, site.id_, paths) + + # find resources similar to the current text + text = resource.search_fields['text'] + if len(text) < 300: # discard resources with too short texts + site_path.resource_id = None + await site_path.save(conn) + return False, False + simhash = simhash_from_bigint(resource.simhash) + index = site.simhash_index + similar_ids = search_simhash(index, simhash) + + # determine the destination resource and resources to be merged into it + old_id = site_path.resource_id + if ( + old_id + and old_id in similar_ids + and ( # similar to old text + dest_resource := await TextResource().load(conn, old_id) + ) + ): + merge_ids = list(filter(lambda elem: elem != old_id, similar_ids)) + else: # no old text, or old text not similar any more + if old_id: + await site_path.unlink_resource( + conn, + app.search_engine, + app.config['elasticsearch']['index_base_name'], + ) + # find the first existing similar resource + for similar_id in similar_ids: + dest_resource = await TextResource().load(conn, similar_id) + if dest_resource: + # also require similar length + l1 = len(resource.search_fields['text']) + l2 = dest_resource.text_len + if 0.95 * l2 <= l1 <= 1.05 * l2: + merge_ids = list( + filter(lambda elem: elem != similar_id, similar_ids) + ) + break + else: + dest_resource = None + merge_ids = [] + + # update or create the destination resource + if dest_resource: + is_new_resource = False + resource.simhash = create_simhash(index, dest_resource.id_, simhash) + await dest_resource.update_from_resource(resource) + resource = dest_resource + else: + is_new_resource = True + resource.simhash = simhash_to_bigint(simhash) + await resource.save(conn) + create_simhash(index, resource.id_, simhash) + + # add resource to search index + if resource.content_type in ('html', 'plain'): + await index_resource( + app.search_engine, + tf, + site_path, + resource, + site.base_url, + url, + ) + + # merge resources: merge_ids -> resource + for merge_id in merge_ids: + # replace links to the merge resource with links to the dest resource + sql = "UPDATE site_path SET resource_id=$1 WHERE resource_id=$2" + await conn.execute(sql, resource.id_ or None, merge_id) + # remove orphaned merge resource + sql = "DELETE FROM resource WHERE id=$1 RETURNING (true, lang)" + found = await conn.fetchval(sql, merge_id) + if found: + await delete_resource( + app.search_engine, + found[1], + merge_id, + ) + + return True, is_new_resource + + +async def _save_shortlink( + conn, site, url, resource, shortlink_url, last_visit +): + """ + Save a shortlink. 
+ """ + shortlink_durl = await Durl(shortlink_url, base=site.base_url) + if shortlink_durl and shortlink_url != url: + sql = "SELECT * FROM site_path WHERE site_id=$1 AND path=$2" + sl_path = shortlink_durl.pwa() + row = await conn.fetchrow(sql, site.id_, sl_path) + shortlink = await SitePath().load_from_row(row) + if not shortlink: + shortlink = SitePath( + site_id=site.id_, + path=sl_path, + last_visit=last_visit, + ok_count=1, + canonical=False, + resource_id=resource.id_, + ) + else: + shortlink.last_visit = last_visit + shortlink.ok_count += 1 + shortlink.canonical = False + shortlink.resource_id = resource.id_ + await shortlink.save(conn) diff --git a/src/atextcrawler/resource/page.py b/src/atextcrawler/resource/page.py new file mode 100644 index 0000000..540a023 --- /dev/null +++ b/src/atextcrawler/resource/page.py @@ -0,0 +1,355 @@ +""" +Parse HTML pages. +""" + +import logging +from copy import deepcopy +from typing import Optional, Union + +from bs4 import BeautifulSoup +from tidylib import tidy_document + +from ..models import ResourceError, ResourceRedirect, Site, TextResource +from ..utils.annotation import ( + annotate, + annotations_remove_section, + clean_annotations, + get_tag_counts, + headline_probability, +) +from ..utils.date_finder import extract_latest_date +from ..utils.durl import Durl, assort_links +from ..utils.html import ( + clean_body, + clean_page, + extract_title, + get_html_lang, + get_html_redirect, +) +from ..utils.http import get_header_links +from ..utils.lang import extract_content_language +from ..utils.section import iter_sections +from ..utils.tag import keep_tags + +logger = logging.getLogger(__name__) +logger_debug = logging.getLogger(__name__ + '.debug') +logger_debug.setLevel(logging.INFO) +logger_links = logging.getLogger(__name__ + '.debug.links') +logger_stats = logging.getLogger(__name__ + '.debug.stats') +logger_sections = logging.getLogger(__name__ + '.debug.sections') + + +async def parse_html( + durl: Durl, + resp: dict, + site: Optional[Site], +) -> Optional[Union[TextResource, ResourceError, ResourceRedirect]]: + """ + Extract relevant data from a response returning a TextResource instance. + + The given URL must be the full URL (incl. scheme and netloc) of the page. 
+ """ + html = resp['content'] + + # follow link to canonical URL + header_links = await get_header_links(resp['headers'], durl, site) + if canonical := header_links.get('canonical'): + if canonical != durl.url(): + return ResourceRedirect(resp['redirects'] + [canonical]) + + # follow html redirect, if present + if redir_url := get_html_redirect(html): + if redir_url not in resp['redirects']: + return ResourceRedirect(resp['redirects'] + [redir_url]) + else: + msg = f'Cyclic HTML redirect: {redir_url} in {resp["redirects"]}' + return ResourceError(msg) + + # require html tag + if not html[:14].lower().startswith(' 0.5: + w = -n_links + elif link_density > 0.3 and avg_text_len < 60: + w = -3 + else: + n_li, li_density, li_len = get_tag_counts( + ('li',), i, f, tags, text + ) + if link_density > 0.2 and li_density > 0.8 and li_len < 50: + w = -3 + if 52 <= lvl < 60: + w = max(w, 1.0) + if 'sidebar' in ' '.join(section_ids.get(i, [])): + w = -3 + if len(txt) < 20 and ('RSS' in txt or 'MENU' in txt): + w = -3 + # special chars + if txt.startswith('←') or txt.endswith('→'): # wordpress navigation + w = -3 + # remove boilerplate texts + if boilerplate_texts and txt in boilerplate_texts: + w = -10 + sections_keep[(i, f)] = w, lvl + + # amend keep scores: look at preceding / subsequent sections with + # equal level and transfer their keep scores to the current section + n = len(sections_keep) + sections = list(sorted(sections_keep.keys())) + # inspect subsequent sections: + for rev_ind, s_range in enumerate(reversed(sections)): + ind = n - 1 - rev_ind + w, lvl = sections_keep[s_range] + if abs(w) <= 2: + w_sum = 0 + n_peers = 0 + for i in range(ind + 1, min(n, ind + 15)): + w_, lvl_ = sections_keep[sections[i]] + if lvl_ != lvl: + break + n_peers += 1 + w_sum += w_ + if n_peers >= 3: + sections_keep[s_range] = w + 2 * w_sum / n_peers, lvl + # inspect preceding sections: + for ind, s_range in enumerate(sections): + w, lvl = sections_keep[s_range] + if abs(w) <= 2: + w_sum = 0 + n_peers = 0 + for i in range(ind - 1, max(0, ind - 15), -1): + w_, lvl_ = sections_keep[sections[i]] + if lvl_ != lvl: + break + n_peers += 1 + w_sum += w_ + if n_peers >= 3: + sections_keep[s_range] = w + 2 * w_sum / n_peers, lvl + + # amend keep scores: look at sections that could be headlines + # for subsequent kept sections and increase their score; + # also allow for up to 2 sections inbetween (which will also + # have their score increased) + for rev_ind, s_range in enumerate(reversed(sections)): + ind = n - 1 - rev_ind + w, lvl = sections_keep[s_range] + if abs(w) <= 2: + if headline_probs.get(s_range, 0) > 0.49: + # look at subsequent sections with higher level + child_weights = [] + for i in range(ind + 1, n): + w_, lvl_ = sections_keep[sections[i]] + if lvl_ <= lvl or w_ < -2: + break + child_weights.append(w_) + if nc := len(child_weights): + child_avg = sum(child_weights) / nc + if w + 1.2 * child_avg > 2: + sections_keep[s_range] = w + 1.2 * child_avg, lvl + if nc > 1: + if (w1 := child_weights[0]) <= 2: + sections_keep[sections[ind + 1]] = ( + w1 + 1.5 * child_avg, + lvl, + ) + if nc > 2: + if (w2 := child_weights[1]) <= 2: + sections_keep[sections[ind + 2]] = ( + w2 + 2 * child_avg, + lvl, + ) + + # clean annotations + clean_annotations(annotations) + + # debug sections + if logger_sections.isEnabledFor(logging.DEBUG): + logger_sections.debug('============= Weighted sections =============') + for i, f, lvl, txt in iter_sections(text, sb, max_level=60): + w, lvl = sections_keep[(i, f)] + indent = ('+' if w > 2 
else '-') * lvl + ts = ','.join(tags[(i + 1, f)]) + logger_sections.debug(f'{indent} {i} {f} {ts} {txt} {w:.2f}') + + # narrow down annotations and text to keep_sections + # drop undecided sections + filtered_text = text + filtered_ann = deepcopy(annotations) + for i, f in sorted(sections_keep.keys(), reverse=True): + w, lvl = sections_keep[(i, f)] + if w <= 2.0: + filtered_ann = annotations_remove_section(filtered_ann, i, f) + filtered_text = filtered_text[:i] + filtered_text[f:] + clean_annotations(filtered_ann) + + # debug filtered sections + if logger_sections.isEnabledFor(logging.DEBUG): + logger_sections.debug('') + logger_sections.debug('============= Filtered sections =============') + fsb = filtered_ann['semantic_breaks'] + ftags = filtered_ann['tags'] + for i, f, lvl, txt in iter_sections(filtered_text, fsb, max_level=100): + indent = ' ' * lvl + ts = ','.join(ftags.get((i + 1, f), [])) + logger_sections.debug(f'{indent} {lvl} {i} {f} {ts} {txt}') + + return filtered_text, filtered_ann diff --git a/src/atextcrawler/resource/plaintext.py b/src/atextcrawler/resource/plaintext.py new file mode 100644 index 0000000..1514508 --- /dev/null +++ b/src/atextcrawler/resource/plaintext.py @@ -0,0 +1,148 @@ +""" +Parse plaintext pages. +""" + +import logging +import re +from typing import Any, Optional, Union + +import pypandoc + +from ..models import ResourceError, ResourceRedirect, Site, TextResource +from ..utils.annotation import annotate +from ..utils.date_finder import extract_latest_date +from ..utils.durl import Durl +from ..utils.http import get_header_links +from ..utils.lang import extract_content_language +from ..utils.muse import parse_muse + +logger = logging.getLogger(__name__) + + +MAX_LINK_TEXT_LENGTH = 100 +""" +Maximum length of a link's text to be kept. + +Cf. table site_link, column link_text. +""" + + +re_url = re.compile( + r'((http|https)://[\w_-]+\.[\w_-]+(:[0-9]+)?' + r'([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?)' +) + + +re_nl = re.compile(r'\r\n') + + +re_ws = re.compile(r'\s*\n\s*\n\s*') + + +re_nn = re.compile(r'\n\n') + + +async def parse_plaintext( + durl: Durl, + resp: dict, + site: Optional[Site], +) -> Optional[Union[ResourceRedirect, TextResource]]: + """ + Extract relevant data from a response returning a TextResource instance. + + The given URL must be the full URL (incl. scheme and netloc) of the page. 
+ """ + text = resp['content'] + + # HTTP headers, canonical URL, shortlink + header_links = await get_header_links(resp['headers'], durl, site) + if canonical := header_links.get('canonical'): + if canonical != durl.url(): + return ResourceRedirect(resp['redirects'] + [canonical]) + shortlink = header_links.get('shortlink') + + if not text: + return None + + text = re_nl.sub('\n', text) + text = re_ws.sub('\n\n', text) + + # meta info + meta: dict[str, Any] = {} + muse = None + if durl.path.endswith('.muse'): + muse = parse_muse(text) + if muse: + meta, text = muse + # title + if not meta.get('title'): + meta['title'] = text[:200].splitlines()[0] + # content language + if not meta.get('lang'): + meta['lang'] = extract_content_language(text) + # publication date + if not meta.get('pub_date'): + meta['pub_date'] = extract_latest_date(text, lang=meta.get('lang')) + + # links + links_int: dict[Durl, tuple[list[str], str]] = {} + links_ext: dict[Durl, tuple[list[str], str]] = {} + for url in re_url.findall(text): + link_durl = await Durl(url[0]) + if link_durl: + if link_durl.site() == durl.site(): + links_int[link_durl] = [], link_durl.url() + else: + links_ext[link_durl] = [], link_durl.url() + + if muse: + html = pypandoc.convert_text(text, 'html5', format='muse').strip() + text, annotations = annotate(html) + else: + text, annotations = annotate_text(text) + + return TextResource( + content_type=resp['parser'], + last_change=meta.get('pub_date'), + text_len=len(text), + lang=meta.get('lang'), + title=meta.get('title'), + init_fields={ + 'durl': durl, + 'site': site, + 'headers': resp['headers'], + 'redirects': resp['redirects'], + 'links_int': links_int, + 'links_ext': links_ext, + 'shortlink': shortlink, + 'canonical': None, + }, + search_fields={ + 'title': meta.get('title'), + 'authors': meta.get('authors'), + 'pub_date': meta.get('pub_date'), + 'keywords': meta.get('keywords'), + 'summary': meta.get('summary'), + 'text': text, + 'annotations': annotations, + }, + ) + + +def annotate_text(text): + """ + Return annoations as :func:`utils.annotation.annotate`does. + + Here we only have information on semantic breaks + (in plaintext they are where empty lines are). + """ + semantic_breaks = {} + for match in re_nn.finditer(text): + semantic_breaks[match.span()[0]] = '' + annotations = { + 'tags': {}, + 'semantic_breaks': semantic_breaks, + 'section_ids': {}, + 'links': {}, + } + return text, annotations diff --git a/src/atextcrawler/resource/sitemap.py b/src/atextcrawler/resource/sitemap.py new file mode 100644 index 0000000..e1b06aa --- /dev/null +++ b/src/atextcrawler/resource/sitemap.py @@ -0,0 +1,149 @@ +""" +Sitemap and SitemapIndex and related operations. +""" + +import logging +from datetime import datetime +from typing import Optional + +import pytz + +from ..models import Sitemap, SitemapIndex, TextResource + +logger = logging.getLogger(__name__) + + +async def get_sitemap_urls( + fetcher, + base_url: Optional[str], + sitemaps=None, +) -> list[dict]: + """ + Try to find sitemaps and fetch and return their URL content. + + Each sitemapped URL is a dict with key 'loc' and optional key 'lastmod'. 
+ """ + if sitemaps: + # test example: https://www.berlin.de/ + check_all = True + elif base_url: + sitemaps = [ + base_url.rstrip('/') + '/sitemap.xml', + base_url.rstrip('/') + '/wp-sitemap.xml', + base_url.rstrip('/') + '/sitemap_index.xml', + base_url.rstrip('/') + '/sitemap.xml.gz', + base_url.rstrip('/') + '/sitemap_index.xml.gz', + base_url.rstrip('/') + '/sitemap.txt', + base_url.rstrip('/') + '/sitemap/', + base_url.rstrip('/') + '/sitemap1.xml', + base_url.rstrip('/') + '/sitemap-index.xml', + base_url.rstrip('/') + '/sitemapindex.xml', + base_url.rstrip('/') + '/sitemap/index.xml', + ] + check_all = False + else: + return [] + urls = [] + for sitemap in sitemaps: + resource = await fetcher.fetch(sitemap) + found = True + if isinstance(resource, SitemapIndex): + for sitemap_ in resource.sitemaps: + sitemaps.append(sitemap_['loc']) + elif isinstance(resource, Sitemap): + urls += resource.urls + elif isinstance(resource, TextResource) and resource.content_type in ( + 'html', + 'plain', + ): + urls += [ + {'loc': durl.url()} + for durl in resource.init_fields['links_int'] + ] + else: + found = False + if found and not check_all: + break + return urls + + +def parse_sitemapindex(sitemapindex): + """ + Parse a sitemap index returning a `SitemapIndex` with found sitemaps. + """ + sitemaps = [] + for tag in sitemapindex.find_all('sitemap'): + if loc := tag.find('loc'): + if loc.string: + sitemap = {'loc': loc.string.strip()} + if lastmod := tag.find('lastmod'): + try: + t = datetime.fromisoformat(lastmod.string.strip()) + sitemap['lastmod'] = t + except: + pass + sitemaps.append(sitemap) + return SitemapIndex(sitemaps=sitemaps) + + +def parse_sitemap(urlset) -> Sitemap: + """ + Return a list of sitemap URLs. + + Each URL is a dict with these keys+values: + + * loc: the full URL of a mapped resource + * lastmod: optional datetime of its last modification + * changefreq: optional info on the change frequency to be expected + * priority: optional info on its priority relative to other resources + + Cf. https://www.sitemaps.org/protocol.html + """ + urls = [] + for tag in urlset.find_all('url'): + if loc := tag.find('loc'): + if loc.string: + url = {'loc': loc.string.strip()} + if lastmod := tag.find('lastmod'): + try: + t = lastmod.string.strip().rstrip('Z') + url['lastmod'] = ( + datetime.fromisoformat(t) + .astimezone(pytz.utc) + .replace(tzinfo=None) + ) + except: + pass + if changefreq := tag.find('changefreq'): + url['changefreq'] = changefreq.string.strip() + if priority := tag.find('priority'): + url['priority'] = priority.string.strip() + urls.append(url) + return Sitemap(urls=urls) + + +def extract_sitemap_paths( + base_url: Optional[str], + urls: list[dict], +) -> tuple[list[tuple[str, bool]], Optional[datetime]]: + """ + Extract essential information from sitemap URLs. + + Return a list of relative paths of the site's resources + (in a form to be easily fed into `add_site_paths`) and + the datetime of the latest change. + + Relative paths are computed using base_url. 
+ """ + paths = [] + latest = None + for url in urls: + loc = url['loc'] + lastmod = url.get('lastmod') + if loc.startswith(base_url or ''): + path = loc.removeprefix(base_url or '').lstrip('/') + path = path.split('#', 1)[0] + paths.append((path, True)) + if lastmod: + latest = max(lastmod, latest or lastmod) + return paths, latest diff --git a/src/atextcrawler/search/__init__.py b/src/atextcrawler/search/__init__.py new file mode 100644 index 0000000..b9a3bba --- /dev/null +++ b/src/atextcrawler/search/__init__.py @@ -0,0 +1,6 @@ +from .engine import ( + delete_resource, + index_resource, + shutdown_engine, + startup_engine, +) diff --git a/src/atextcrawler/search/engine.py b/src/atextcrawler/search/engine.py new file mode 100644 index 0000000..7a72de6 --- /dev/null +++ b/src/atextcrawler/search/engine.py @@ -0,0 +1,270 @@ +""" +Search engine, for now elasticsearch. + +We have one index per supported language and a default one. +""" + +import logging +import warnings +from difflib import SequenceMatcher +from typing import Union + +from elasticsearch import AsyncElasticsearch +from elasticsearch.exceptions import NotFoundError + +from ..utils.annotation import pack_annotations +from ..utils.section import concat_section_texts + +logger = logging.getLogger(__name__) + + +warnings.filterwarnings( + 'ignore', + 'The client is unable to verify that the' + ' server is Elasticsearch due security privileges on the server side', +) + + +MIN_INDEXING_TIMEOUT_SECONDS = 5 + + +language_analyzers = { + 'en': 'english', + 'de': 'german', + #'fr': 'french', + #'el': 'greek', + #'es': 'spanish', + 'default': 'standard', +} + + +properties = { + 'resource_id': {'type': 'long'}, + 'site_id': {'type': 'long'}, + 'url': {'type': 'text'}, + 'base_url': {'type': 'text'}, + 'pub_date': {'type': 'date', 'format': 'yyyy-MM-dd||yyyy-MM||yyyy'}, + 'lang': {'type': 'keyword'}, + 'title': {'type': 'text'}, + 'authors': {'type': 'text'}, + 'summary': {'type': 'text'}, + 'keywords': {'type': 'text'}, + 'collections': {'type': 'keyword'}, + 'time_horizon': {'type': 'keyword'}, + 'orig_source': {'type': 'text'}, + 'topics': {'type': 'text'}, + 'annotations': {'type': 'text', 'index': False}, + 'sections': { + 'type': 'nested', + 'properties': { + 'start_ids': {'type': 'integer'}, + 'end_ids': {'type': 'integer'}, + 'text': {'type': 'text', 'index_options': 'offsets'}, + 'embedding': {'type': 'dense_vector', 'dims': 512}, + }, + }, +} + + +async def startup_engine(config): + """ + Open the search engine for access. + """ + engine = AsyncElasticsearch( + host=config['elasticsearch']['host'], + api_key=( + config['elasticsearch']['id'], + config['elasticsearch']['api_key'], + ), + use_ssl=False, + timeout=20, + ) + engine.index_base_name = config['elasticsearch']['index_base_name'] + await create_indices(engine) + await open_indices(engine) + return engine + + +async def create_indices(engine): + """ + Create indices for all configured langiages. 
+ """ + for lang, analyzer in language_analyzers.items(): + index_name = engine.index_base_name + '_text_' + lang + if not await engine.indices.exists(index=index_name): + await engine.indices.create(index=index_name) + await engine.indices.close(index=index_name) + await engine.indices.put_settings( + index=index_name, + body={ + 'analysis': {'analyzer': {'default': {'type': analyzer}}}, + 'refresh_interval': '60s', + }, + ) + await engine.indices.put_mapping( + index=index_name, + body={'properties': properties}, + ) + + +async def open_indices(engine): + """ + Open indices for all configure languages. + """ + for lang in language_analyzers.keys(): + index_name = engine.index_base_name + '_text_' + lang + await engine.indices.open(index=index_name) + + +async def shutdown_engine(engine): + """ + Close the connection to the search engine. + """ + # await close_indices(engine) + await engine.close() + + +async def close_indices(engine): + """ + Close indices. UNUSED. + """ + for lang in language_analyzers.keys(): + index_name = engine.index_base_name + '_text_' + lang + await engine.indices.close(index=index_name) + + +async def index_resource( + engine, + tf, + site_path, + resource, + base_url, + url, +): + """ + Index a resource. + """ + lang = resource.lang + index_lang = lang if lang in language_analyzers.keys() else 'default' + index_name = engine.index_base_name + '_text_' + index_lang + pub_date = resource.search_fields.get('pub_date') + if pub_date: + pub_date = str(pub_date.date()) + text = resource.search_fields.get('text') + annotations = resource.search_fields.get('annotations') + semantic_breaks = annotations['semantic_breaks'] + sections = [] + for section_ids, txt in concat_section_texts(text, semantic_breaks): + embedding = await tf.embed(txt) + sections.append( + { + 'start_ids': section_ids[0], + 'end_ids': section_ids[-1], + 'text': txt, + 'embedding': embedding, + } + ) + doc = { + 'resource_id': resource.id_, + 'site_id': site_path.site_id, + 'url': url, + 'base_url': base_url, + 'pub_date': pub_date, + 'lang': resource.lang, + 'title': resource.search_fields.get('title'), + 'authors': resource.search_fields.get('authors'), + 'summary': resource.search_fields.get('summary'), + 'keywords': resource.search_fields.get('keywords'), + 'collections': resource.search_fields.get('collections'), + 'time_horizon': resource.search_fields.get('time_horizon'), + 'orig_source': resource.search_fields.get('orig_source'), + 'topics': resource.search_fields.get('topics'), + 'annotations': pack_annotations(annotations), + 'sections': sections, + } + timeout_seconds = max(MIN_INDEXING_TIMEOUT_SECONDS, int(len(text) / 1000)) + await engine.index( + id=resource.id_, + index=index_name, + body=doc, + timeout=f'{timeout_seconds}s', + ) + + +async def delete_resource(engine, lang, resource_id): + """ + Delete a resource. + """ + index_name = engine.index_base_name + '_text_' + (lang or 'default') + try: + await engine.delete(index_name, resource_id) + except NotFoundError: + msg = f'Cannot delete resource from index, not found: {resource_id}' + logger.warning(msg) + + +async def find_duplicate(engine, site_id, resource) -> Union[bool, None, int]: + """ + UNUSED. + + Try to find a duplicate resource with matching site. + + If the search backend query fails, return False. + If no matching resource was found, return None. + If a matching resource was found, return its id. 
+ """ + # get sample texts + text = resource.search_fields['text'] + if not text or len(text) < 100: + return None + # annotations = resource.search_fields['annotations'] + # semantic_breaks = annotations['semantic_breaks'] + # texts = [] + # for _, txt in concat_section_texts(text, semantic_breaks): + # texts.append(txt) + # texts = extract_samples(texts) + + # # search for sample texts + # text_count = len(texts) + # should_min = max(1, int(0.6 * text_count)) + # should = [] + # for text in texts: + # should.append({'match': {'sections.text': text}}) + query = { + 'bool': { + 'must': { + 'nested': { + 'path': 'sections', + 'query': {'match': {'sections.text': text}}, + }, + }, + 'filter': { + 'term': { + 'site_id': site_id, + }, + }, + } + } + fields = [ + 'url', + 'sections.text', + 'site_id', + ] + response = await engine.search( + index=engine.index_base_name + '_text_*', + body={ + 'query': query, + 'fields': fields, + 'from': 0, + 'size': 3, + '_source': False, + }, + ) + if response['timed_out']: + return False + for hit in response.get('hits', {}).get('hits'): + txt = ' '.join(hit['fields']['sections.text']) + similarity = SequenceMatcher(None, text, txt).ratio() + if similarity > 0.99: + return hit['_id'] + return None diff --git a/src/atextcrawler/site/__init__.py b/src/atextcrawler/site/__init__.py new file mode 100644 index 0000000..e2dacd9 --- /dev/null +++ b/src/atextcrawler/site/__init__.py @@ -0,0 +1,9 @@ +""" +Websites. +""" + +from .feeds import fetch_feeds +from .operations import checkin_site, checkout_site, process_site, update_site +from .queue import process_site_queue +from .robots import RobotsInfo +from .seed import load_seeds diff --git a/src/atextcrawler/site/__main__.py b/src/atextcrawler/site/__main__.py new file mode 100644 index 0000000..b40afc1 --- /dev/null +++ b/src/atextcrawler/site/__main__.py @@ -0,0 +1,68 @@ +""" +Tool for analyzing a website. + +Fetch the startpage and output information to console. +Do not change any persistent data. +""" + +import asyncio +import logging +import sys + +import aiohttp + +from ..models import TextResource +from ..resource import ResourceFetcher, extract_sitemap_paths, get_sitemap_urls +from ..site.robots import RobotsInfo +from ..utils.durl import Durl +from .parse import parse_startpage + +logger = logging.getLogger() +logger.setLevel(logging.WARNING) +logger.addHandler(logging.StreamHandler()) + + +async def run(): + """ + Fetch the startpage of a website and show information about it. + + The URL must be given as commandline argument. 
+ """ + base_url = sys.argv[1] + async with aiohttp.ClientSession() as session: + if not (base_durl := await Durl(base_url)): + return + fetcher = ResourceFetcher(session) + resource = await fetcher.fetch(base_url) + logger.warning(repr(resource)) + if ( + isinstance(resource, TextResource) + and resource.content_type == 'html' + ): + site = await parse_startpage(resource) + # site.crawl_enabled = await site_filter(site) + logger.warning(repr(site)) + logger.warning('') + for durl, text in site.links_ext.items(): + logger.warning(f' {durl} {text}') + logger.warning(f'{durl.url()} -------- {text}') + logger.warning('') + logger.warning(f'Redirects: {resource.init_fields["redirects"]}') + logger.warning('') + robots = await RobotsInfo(base_url) + urls = await get_sitemap_urls( + fetcher, base_url, sitemaps=robots.site_maps + ) + paths, latest = extract_sitemap_paths(base_url, urls) + for path in paths: + logger.warning(path) + logger.warning(f'Feeds: {site.feeds}') + logger.warning(latest) + # sample_links = extract_samples(resource.init_fields['links_int']) + # logger.warning(f'************* {sample_links}') + else: + logger.warning('(No text resource or error.)') + + +if __name__ == '__main__': + asyncio.run(run()) diff --git a/src/atextcrawler/site/feeds.py b/src/atextcrawler/site/feeds.py new file mode 100644 index 0000000..da4fbda --- /dev/null +++ b/src/atextcrawler/site/feeds.py @@ -0,0 +1,100 @@ +""" +High-level feed-related stuff. + +See resource.feed for low-level stuff not primarily related to sites. +""" + +from datetime import datetime +from typing import Optional + +from ..models import Feed +from ..resource import store_feed_entries, update_feed + + +async def store_new_feeds(conn, site_id, feeds: dict): + """ + Store new feeds in table site_feed. + """ + sql = "SELECT array_agg(url) FROM site_feed WHERE site_id=$1" + known_feeds = (await conn.fetchval(sql, site_id)) or [] + for feed_url in feeds.keys(): + if feed_url not in known_feeds: + feed = Feed( + site_id=site_id, + url=feed_url, + ) + await feed.save(conn) + + +async def get_feeds(conn, site_id) -> list[Feed]: + """ + Return stored feeds for the given site. + """ + sql = "SELECT * FROM site_feed WHERE site_id=$1" + rows = (await conn.fetch(sql, site_id)) or [] + return [(await Feed().load_from_row(row)) for row in rows] + + +async def fetch_feeds(fetcher, conn, site) -> Optional[datetime]: + """ + Fetch feeds, add new resources and return the latest content update time. + """ + feeds = await get_feeds(conn, site.id_) + latest = None + for feed in feeds: + feed_content = await update_feed(fetcher, feed, conn) + if feed_content: + await store_feed_entries(conn, site, feed_content) + if feed.t_content: + latest = max(latest or feed.t_content, feed.t_content) + return latest + + +if __name__ == '__main__': + # only use this on a dev instance! + import asyncio + import logging + import sys + + import aiohttp + + from ..config import Config + from ..db import PGPool + from ..resource.fetch import ResourceFetcher + from .operations import process_site, update_site + + logger = logging.getLogger() + logger.setLevel(logging.DEBUG) + config = Config().get() + url = sys.argv[1] + + async def run(): + """ + Fetch and display a site. 
+ """ + app = None # TODO + async with PGPool(config['postgresql']) as pool: + async with pool.acquire() as conn: + async with aiohttp.ClientSession() as session: + fetcher = ResourceFetcher(session) + site, _ = await update_site(app, fetcher, conn, url) + logger.warning(site) + await process_site(fetcher, conn, site) + latest = await fetch_feeds(fetcher, conn, site) + logger.warning(f'latest: {latest}') + # feed = Feed(url=url) + # feed_content = await update_feed(fetcher, feed, conn) + # if isinstance(feed_content, ResourceError): + # print(feed_content) + # else: + # print(feed) + # pprint(feed_content[0]) + # print('---- 2nd try ----') + # feed_content = await update_feed(fetcher, feed, conn) + # if isinstance(feed_content, ResourceError): + # print(feed_content) + # else: + # print(feed) + # pprint(feed_content[0]) + + asyncio.run(run()) diff --git a/src/atextcrawler/site/operations.py b/src/atextcrawler/site/operations.py new file mode 100644 index 0000000..36689c5 --- /dev/null +++ b/src/atextcrawler/site/operations.py @@ -0,0 +1,267 @@ +""" +Operations on sites. +""" + +import logging +from datetime import datetime, timedelta +from typing import Optional + +from asyncpg import Connection + +from ..models import Crawl, Site, TextResource +from ..resource import ( + add_site_paths, + extract_sitemap_paths, + get_sitemap_urls, + store_boilerplate_texts, +) +from ..utils.durl import Durl +from ..utils.similarity import get_simhash_index +from .feeds import fetch_feeds, store_new_feeds +from .parse import parse_startpage +from .robots import RobotsInfo + +logger = logging.getLogger(__name__) + + +async def checkout_site( + app, conn: Connection +) -> tuple[Optional[int], bool, bool]: + """ + Get the id of a site to be crawled and mark it with crawl_active=true. + + Also return whether the site shall be fully crawled; if not, this + means that just the resources from the feeds shall be crawled. + + Also return whether more sites might be available. + """ + async with conn.transaction(): + sql = ( + "SELECT id, next_full_crawl < now() at time zone 'UTC' is_full" + " FROM site WHERE crawl_enabled AND crawl_active = false" + " AND (next_full_crawl < now() at time zone 'UTC'" + " OR next_feed_crawl < now() at time zone 'UTC')" + " LIMIT 1 FOR UPDATE SKIP LOCKED" + ) + row = await conn.fetchrow(sql) + if row: + site_id = row['id'] + is_full = row['is_full'] + sql = "UPDATE site SET crawl_active = true WHERE id=$1" + await conn.execute(sql, site_id) + site = await Site().load(conn, site_id) + if site: + site.base_durl = await Durl(site.base_url) + if site.base_durl: + site.simhash_index = await get_simhash_index(conn, site_id) + return site, is_full, True + else: + # site not available; schedule next crawl + int_full = app.config['crawl']['full_crawl_interval'] + int_feed = app.config['crawl']['feed_crawl_interval'] + now = datetime.utcnow() + t_full = now + timedelta(seconds=int_full) + t_feed = now + timedelta(seconds=int_full + int_feed) + sql = ( + "UPDATE site SET crawl_active=false," + " next_full_crawl=$1, next_feed_crawl=$2" + " WHERE id=$3" + ) + await conn.execute(sql, t_full, t_feed, site_id) + return None, False, True + return None, False, True + return None, False, False + + +async def update_site( + app, fetcher, conn: Connection, base_url, site: Site = None +) -> tuple[Optional[Site], bool]: + """ + Try to fetch base_url and return a site and whether a new one was created. + + This function is run for all sites (including blacklisted and irrelevant + ones). 
It determines whether the site shall be crawled. + + If an errors occurs, return (None, False), and if a site was given, + also set it to crawl_enabled=False and remove crawling schedules. + + If base_url could be fetched, update the site, possibly creating + a new one. + + If the site has crawl_enabled, and no full crawl is scheduled, + schedule one (by updating column `next_full_crawl`). + """ + # fetch startpage + logger.info(f'Updating site={site}, base_url={base_url}') + resource = await fetcher.fetch(base_url, site=site) + if ( + not isinstance(resource, TextResource) + or resource.content_type != 'html' + ): + if site: + site.meta_info['error'] = 'Invalid start page' + site.crawl_enabled = False + site.next_full_crawl = None + site.next_feed_crawl = None + await site.save(conn) + logger.info(f'Failed startpage {base_url}: {resource}') + return None, False + + # parse startpage (extract site information) and save the site + site = await parse_startpage(resource, app=app, site=site) + site_id, created = await site.save(conn) + if created: + logger.debug(f'Created {site}') + + # add black-/white-listing info + is_allowed = await is_site_allowed(conn, site.id_, base_url) + if is_allowed is not None and is_allowed != site.crawl_enabled: + site.crawl_enabled = is_allowed + await site.save(conn) + + # schedule full crawl, if none is scheduled and the site shall be crawled + if site.crawl_enabled: + sql = ( + "UPDATE site" + " SET next_full_crawl=now() at time zone 'UTC'" + " WHERE id=$1 AND next_full_crawl IS null" + ) + await conn.execute(sql, site_id) + + return site, created + + +async def is_site_allowed( + conn: Connection, + site_id: Optional[int], + base_url: str, +) -> Optional[bool]: + """ + Return True if the site is whitelisted, False if blacklisted, else None. + + Also add missing site_ids to the annotations. + """ + sql = "SELECT * FROM site_annotation WHERE site_id=$1 OR base_url=$2" + anns = await conn.fetch(sql, site_id, base_url) + for ann in anns: + if ann['ann_type'] == 'blacklist': + return False + if ann['ann_type'] == 'whitelist': + return True + # add missing site_ids + if site_id and any([ann['site_id'] is None for ann in anns]): + sql = "UPDATE site_annotation SET site_id=$1 WHERE base_url=$2" + await conn.execute(sql, site_id, base_url) + return None + + +async def process_site(fetcher, conn: Connection, site: Site): + """ + Process a site: fetch and store more information. + + Store external and internal links; find boilerplate texts; + fetch sitemaps; fetch feeds; update date of last publication. 
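+
+ Hypothetical glue code sketching where this sits in a full crawl
+ (checkout_site and checkin_site are defined in this module;
+ creating the Crawl row is elided here):
+
+ site, is_full, more = await checkout_site(app, conn)
+ if site and is_full:
+ await process_site(fetcher, conn, site)
+ await checkin_site(app, conn, site, crawl)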
+ """ + if not site.id_: # only to satisfy typing + return + if site.links_ext: + await _store_cross_site_links(conn, site.id_, site.links_ext) + if site.links_int: + paths = [] + for durl, (rel, _) in site.links_int.items(): + canon = (rel and rel.lower() == 'canonical') or None + paths.append((durl.pwa(), canon)) + await add_site_paths(conn, site.id_, paths) + + await store_boilerplate_texts(fetcher, conn, site) + + # get sitemaps and add their resources + robots = await RobotsInfo(site.base_url) # type: ignore + urls = await get_sitemap_urls( + fetcher, site.base_url, sitemaps=robots.site_maps + ) + paths_, latest = extract_sitemap_paths(site.base_url, urls) + await add_site_paths(conn, site.id_, paths_) + + # store feeds and their resources + await store_new_feeds(conn, site.id_, site.feeds) + latest_ = await fetch_feeds(fetcher, conn, site) + if latest_: + latest = max(latest or latest_, latest_) + + # update last_pub + if latest: + site.last_pub = latest + await site.save(conn) + + +async def checkin_site(app, conn: Connection, site: Site, crawl: Crawl): + """ + Unlock the site and schedule next crawl. + + *crawl* is the crawl that has just finished (regularly or stopped). + + If the crawl was stopped (t_end is None), just unlock the site. + + Otherwise schedule a crawl of the same type. After a full crawl + also a feed crawl is scheduled, if there was none scheduled. + """ + if crawl.t_end is None: + sql = "UPDATE site SET crawl_active=false WHERE id=$1" + await conn.execute(sql, site.id_) + elif crawl.is_full: + full_interval = app.config['crawl']['full_crawl_interval'] + feed_interval = app.config['crawl']['feed_crawl_interval'] + next_full_crawl = crawl.t_begin + timedelta(seconds=full_interval) + next_feed_crawl = crawl.t_begin + timedelta(seconds=feed_interval) + sql = ( + "UPDATE site SET crawl_active=false, next_full_crawl=$1," + " next_feed_crawl=coalesce(next_feed_crawl, $2) WHERE id=$3" + ) + await conn.execute(sql, next_full_crawl, next_feed_crawl, site.id_) + else: + feed_interval = app.config['crawl']['feed_crawl_interval'] + next_feed_crawl = crawl.t_begin + timedelta(seconds=feed_interval) + sql = ( + "UPDATE site SET crawl_active=false, next_feed_crawl=$1" + " WHERE id=$2" + ) + await conn.execute(sql, next_feed_crawl, site.id_) + + +async def _store_cross_site_links( + conn: Connection, + site_id: int, + links: dict[Durl, tuple[list[str], str]], +) -> None: + """ + Put outgoing links into site_link/site_queue for existing/unknown sites. + + Separate outgoing links from *site_id* into two classes: + (a) existing sites (rows in table site) and (b) unknown links. + Add links from class (a) to table site_link. + Add links from class (b) to table site_queue. 
+ """ + # add outgoing cross-site links for existing sites to table site_link + urls = [url.site() for url in links.keys()] + values = [] + sql = "SELECT id, unnest(base_urls) url FROM site WHERE base_urls && $1" + if rows := await conn.fetch(sql, urls): + for row in rows: + if (durl := await Durl(row['url'])) in links.keys(): + _, link_text = links.pop(durl) + if site_id != row['id']: + values.append((site_id, row['id'], link_text)) + sql = ( + "INSERT INTO site_link (src, dst, link_text)" + " VALUES ($1, $2, $3) ON CONFLICT (src, dst) DO NOTHING" + ) + await conn.executemany(sql, values) + + # add outgoing cross-site links for unknown sites to table site_queue + sql = "INSERT INTO site_queue (src, url, link_text) VALUES ($1, $2, $3)" + values = [ + (site_id, durl.site()[:200], link_text[:100]) + for durl, (_, link_text) in links.items() + ] + await conn.executemany(sql, values) diff --git a/src/atextcrawler/site/parse.py b/src/atextcrawler/site/parse.py new file mode 100644 index 0000000..e29b15b --- /dev/null +++ b/src/atextcrawler/site/parse.py @@ -0,0 +1,255 @@ +""" +Parsing of a site's startpage. +""" + +import re +from datetime import datetime +from typing import Any, Optional + +from ..models import Site, TextResource +from ..resource import feed_types +from ..utils.durl import Durl, get_ips +from ..utils.html import clean_html +from ..utils.lang import clean_lang +from ..utils.link import ( + extract_domain, + in_blacklist, + link_rels, + meta_names, + meta_props, +) + +re_meta_keyword_sep = re.compile('[,;\r\n]') + + +def cut_str(s: Optional[str], l: int) -> Optional[str]: + """ + Cut a string *s* to a maximal length *l* from the left. + """ + return s[:l] if s else None + + +async def parse_startpage( + startpage: TextResource, app=None, site=None +) -> Site: + """ + Parse a site's startpage and return a Site instance. + + If a site instance is given, update it. 
+ """ + durl = startpage.init_fields['durl'] + soup = startpage.init_fields['head'] + meta = collect_meta_tags(soup) + meta_links = await collect_meta_links(soup, durl) + links_ext = await collect_external_links(startpage, meta_links) + links_int = startpage.init_fields['links_int'] + langs = extract_languages(startpage, meta, meta_links) + title, description, keywords = extract_meta_texts(startpage, meta) + + # feeds + feeds = meta_links['feeds'] + if 'wordpress' in meta.get('generator', '').lower(): + url = durl.site() + 'feed/' + feeds[url] = 'application/rss+xml' + # TODO later: maybe also probe other possible feed paths 'rss', 'rss/' + + # network params (canonical_url, base_urls, domains) + ips = await get_ips(durl.hostname) + redirects = [] + for redirect in startpage.init_fields['redirects']: + redir_url = await Durl(redirect) + if redir_url: + redirects.append(redir_url.site()) + base_urls = redirects + [durl.url()] + domains = [extract_domain(durl.hostname)] + + if site: # update an existing Site + site.canonical_url = meta_links['canonical_url'] or site.canonical_url + site.base_urls = base_urls + site.domains = domains + site.ips = ips + site.last_update = datetime.utcnow() + site.last_pub = startpage.last_change + site.langs = langs + site.alt_langs = meta_links['alt_langs'] + site.title = title + site.description = description + site.keywords = keywords + site.linkbacks.update(meta_links['linkbacks']) + site.meta_info = meta + site.__post_init__( + base_durl=durl, + feeds=feeds, + links_ext=links_ext, + links_int=links_int, + startpage_text=startpage.search_fields['text'], + ) + else: # create new Site instance + site = Site( + # post_init fields + base_durl=durl, + feeds=feeds, + links_ext=links_ext, + links_int=links_int, + startpage_text=startpage.search_fields['text'], + # dataclass fields + canonical_url=meta_links['canonical_url'], + base_urls=base_urls, + domains=domains, + ips=ips, + last_update=datetime.utcnow(), + last_pub=startpage.last_change, + langs=list(langs), + alt_langs=meta_links['alt_langs'], + title=title, + description=description, + keywords=keywords, + linkbacks=meta_links['linkbacks'], + meta_info=meta, + ) + if site.ips is None and site.url: + site.ips = await get_ips(site.url.hostname) + if app and site.startpage_text: + site_filter = app.plugins['filter_site'].site_filter + site.crawl_enabled = await site_filter(site) + return site + + +def collect_meta_tags(soup): + """ + Collect selected meta tags (meta_names and meta_props) with their values. + """ + meta = {} + for tag in soup.find_all('meta'): + if (name := tag.get('name')) and name in meta_names: + meta[name] = tag.get('content') + if (property := tag.get('property')) in meta_props: + if content := tag.get('content'): + meta[property] = content + if tag.get('http-equiv') == 'content-language': # old html + if content := tag.get('content'): + meta['http_equiv_lang'] = content + return meta + + +async def collect_meta_links(soup, base_durl) -> dict[str, Any]: + """ + Collect link tags with site scope (feeds, linkbacks, canonical, ...). 
+ """ + linkbacks = {} + feeds = {} + alt_langs = {} + canonical_url = None + for tag in soup.find_all('link'): + if not (rels := set(tag.get('rel', []))) or not rels & link_rels: + continue + if not (url := tag.get('href')): + continue + if not (link_durl := await Durl(url, base=base_durl)): + continue + if in_blacklist(link_durl.hostname): + continue + link_url = link_durl.url() + link_type = tag.get('type') + if link_type in feed_types: + feeds[link_url] = link_type + elif 'canonical' in rels: + canonical_url = link_url + elif 'alternate' in rels and (hreflang := tag.get('hreflang')): + if lang := clean_lang(hreflang): + alt_langs[lang] = link_durl.url() + elif 'webmention' in rels: + linkbacks[link_url] = 'webmention' + elif 'pingback' in rels: + linkbacks[link_url] = 'pingback' + if canonical_url: + if canonical_durl := await Durl(canonical_url): + canonical_url = canonical_durl.site() + else: + canonical_url = None + return { + 'feeds': feeds, + 'linkbacks': linkbacks, + 'alt_langs': alt_langs, + 'canonical_url': canonical_url, + } + + +async def collect_external_links(startpage, meta_links) -> dict[str, str]: + """ + Return external links (mapping from URL to link text) from startpage. + + Also add links to alternate language variants of the site. + """ + external_links = startpage.init_fields['links_ext'].copy() + netloc = startpage.init_fields['durl'].netloc + for lang, lang_url in meta_links['alt_langs'].items(): + if netloc not in lang_url: + durl = await Durl(lang_url) + if durl: + external_links[durl] = f'Alternate language: {lang}' + return external_links + + +def extract_meta_texts(page, meta) -> tuple[str, Optional[str], list[str]]: + """ + Extract and return title, description, keywords from a page and meta tags. + """ + title = meta.get('og:site_name') + if not title: + title = page.search_fields['title'] or '' + if meta_title := meta.pop('title', None): + if meta_title.lower() not in title.lower(): + title += ('; ' if title else '') + meta_title + title = cut_str(clean_html(title), 200) + description = cut_str(clean_html(meta.pop('description', None)), 2000) + if meta_keywords := meta.pop('keywords', None): + kws = re_meta_keyword_sep.split(meta_keywords) + keywords = [kw.strip()[:50] for kw in kws if kw.strip()] + if len(keywords) < 2: + keywords = [ + kw.strip()[:50] + for kw in meta_keywords.split(' ') + if kw.strip() + ] + else: + keywords = [] + return title, description, keywords + + +def extract_languages(page, meta, meta_links) -> set[str]: + """ + Extract languages from a page's html tag, meta tags and HTTP headers. + + Also add the language detected in the text content of the page. + + Return a set of ISO 639-1 language codes. 
+ + See also https://www.w3.org/International/questions/qa-http-and-lang and + https://www.w3.org/International/questions/qa-html-language-declarations + """ + languages = set() + if lang := clean_lang(page.lang): + languages.add(lang) + if lang := clean_lang(meta.get('http_equiv_lang')): + languages.add(lang) + if lang := clean_lang(meta.get('dc.language')): + languages.add(lang) + if lang := clean_lang(meta.get('og:locale')): + languages.add(lang) + for lang, lang_url in meta_links['alt_langs'].items(): + if page.init_fields['durl'].netloc in lang_url: + if lng := clean_lang(lang): + languages.add(lng) + lngs = ( + page.init_fields['headers'] + .get('Content-Language', '') + .lower() + .replace(' ', '') + .split(',') + ) + for lng in lngs: + if lang := clean_lang(lng): + languages.add(lang) + languages.add(page.lang) + return languages diff --git a/src/atextcrawler/site/queue.py b/src/atextcrawler/site/queue.py new file mode 100644 index 0000000..1cf77d6 --- /dev/null +++ b/src/atextcrawler/site/queue.py @@ -0,0 +1,127 @@ +""" +Queue of sites. + +When processing a resource, its external links are put into database table +`site_queue`. +The items in `site_queue` are processed in :func:`process_site_queue`. +This is done baseURL by baseURL (see :func:`iter_site_queue`). +While doing this, cross-site links are put into table `site_link`. +""" + +import logging +from typing import AsyncIterator, Optional + +import aiohttp +from asyncpg import Connection + +from ..resource import ResourceFetcher +from .operations import update_site + +logger = logging.getLogger(__name__) + + +async def process_site_queue(app, pool): + """ + Loop over queued sites creating new sites and adding cross-site links. + """ + site_delay = app.config['crawl']['site_delay'] + resource_delay = app.config['crawl']['resource_delay'] + async with pool.acquire() as conn: + async with aiohttp.ClientSession() as session: + fetcher = ResourceFetcher(session) + while app.running: + async for base_url, links_from in iter_site_queue(app, conn): + # get or create site + msg = f'Site queue: updating {base_url}' + logger.debug(msg) + site, created = await update_site( + app, fetcher, conn, base_url + ) + if site: + await store_incoming_site_site_links( + conn, site.id_, links_from + ) + # delete handled queue items + sql = "DELETE FROM site_queue WHERE url=$1" + await conn.execute(sql, base_url) + await app.sleep(resource_delay) + logger.debug( + f'Queued sites exhausted, sleeping' + f' for {site_delay} seconds' + ) + await app.sleep(site_delay) + + +async def iter_site_queue( + app, conn: Connection +) -> AsyncIterator[tuple[str, dict[int, str]]]: + """ + Yield URLs with aggregated link information from site_queue. + + Yield a URL and a dict mapping ids of linking sites to link texts. 
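+
+ For example, if site_queue contains two rows with the same URL
+ (hypothetical values)
+
+ (src=3, url='https://example.org/', link_text='example')
+ (src=7, url='https://example.org/', link_text='an example site')
+
+ this yields ('https://example.org/', {3: 'example', 7: 'an example site'}).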
+ """ + site_revisit_interval = app.config['crawl']['site_revisit_interval'] + while app.running: + sql = ( + "SELECT url, array_agg(src) srcs," + " array_agg(link_text) link_texts" + " FROM site_queue GROUP BY url LIMIT 1" + ) + row = await conn.fetchrow(sql) + if row: + base_url = row['url'] + links_from = {} + srcs = row['srcs'] + link_texts = row['link_texts'] + for i in range(len(srcs)): + if src := srcs[i]: + links_from[src] = link_texts[i] + if site_id := await site_recently_updated( + conn, base_url, site_revisit_interval + ): + # just store incoming links and remove the site from the queue + await store_incoming_site_site_links(conn, site_id, links_from) + sql = "DELETE FROM site_queue WHERE url=$1" + await conn.execute(sql, base_url) + else: + yield base_url, links_from + else: + break + + +async def site_recently_updated( + conn: Connection, + base_url: str, + site_revisit_interval: float, +) -> Optional[int]: + """ + Return the id of the site with given base_url if it was updated recently. + """ + sql = ( + f"SELECT id FROM site WHERE $1=any(base_urls)" + f" AND last_update + interval '{site_revisit_interval} seconds'" + f" > now() at time zone 'utc' LIMIT 1" + ) + site_id = await conn.fetchval(sql, base_url) + return site_id + + +async def store_incoming_site_site_links( + conn: Connection, site_id: int, links_from: dict +): + """ + Store incoming site-site links (irrespective of crawl_enabled). + + *site_id* is the id of the site to which the links in *links_from* point. + """ + sql = ( + "INSERT INTO site_link" + " (src, dst, link_text) VALUES ($1, $2, $3)" + " ON CONFLICT (src, dst) DO NOTHING" + ) + values = [ + (from_id, site_id, link_text) + for from_id, link_text in links_from.items() + if from_id != site_id + ] + await conn.executemany(sql, values) diff --git a/src/atextcrawler/site/robots.py b/src/atextcrawler/site/robots.py new file mode 100644 index 0000000..ff8f77c --- /dev/null +++ b/src/atextcrawler/site/robots.py @@ -0,0 +1,98 @@ +""" +Fetch and evaluate a website's robots.txt. +""" + +import logging +from typing import Optional, Union +from urllib.robotparser import RobotFileParser + +import aiohttp + +logger = logging.getLogger(__name__) + + +class RobotsInfo(RobotFileParser): + """ + Obtain information from a site's robots.txt. + + After instantiation you must await :meth:`startup`. + """ + + def __init__( + self, + site_url: str, + user_agent: str = '*', + session: aiohttp.ClientSession = None, + ): + super().__init__() + self.__user_agent = user_agent + self.__site_url = site_url.rstrip('/') + self.__robots_url = self.__site_url + '/robots.txt' + self.__timeout = aiohttp.ClientTimeout(sock_connect=2, sock_read=3) + self.__session = session + + def __await__(self): + return self.__ainit__().__await__() + + async def __ainit__(self): + if self.__session: + content = await self.__get_robots_txt(self.__session) + else: + async with aiohttp.ClientSession() as session: + content = await self.__get_robots_txt(session) + self.parse(content.splitlines()) + self.__delay = self.crawl_delay(self.__user_agent) + request_rate = self.request_rate(self.__user_agent) + if request_rate: + self.__delay = request_rate.seconds / request_rate.requests + self.__site_maps = super().site_maps() or [] + return self + + async def __get_robots_txt(self, session: aiohttp.ClientSession) -> str: + """ + Fetch and return the robots.txt over http. 
+ """ + try: + async with session.get( + self.__robots_url, timeout=self.__timeout + ) as resp: + if resp.status == 200: + try: + content = await resp.text() + except: + body = await resp.read() + content = body.decode( + resp.charset or 'utf-8', errors='ignore' + ) + else: + content = '' + except aiohttp.ClientError: + content = '' + return content + + @property + def user_agent(self) -> str: + """ + The user agent being used. + """ + return self.__user_agent + + @property + def delay(self) -> Optional[Union[int, float]]: + """ + The delay to be used between requests. + """ + return self.__delay + + @property + def site_maps(self) -> list[str]: + """ + The list of sitemaps of the site. + """ + return self.__site_maps + + def can_fetch_url(self, url: str) -> bool: + """ + Return whether fetching of the given *url* is allowed. + """ + return super().can_fetch(self.__user_agent, url) diff --git a/src/atextcrawler/site/seed.py b/src/atextcrawler/site/seed.py new file mode 100644 index 0000000..0648b7f --- /dev/null +++ b/src/atextcrawler/site/seed.py @@ -0,0 +1,72 @@ +""" +Seeding of new installations with URLs from blacklists and whitelists. +""" + +from pathlib import Path + +import asyncpg + +from ..utils.durl import Durl + + +async def load_seeds(config: dict, pool: asyncpg.Pool) -> None: + """ + Add seed file contents (site blacklist and whitelist). + + If there are sites already, do nothing. + """ + async with pool.acquire() as conn: + site_count = await conn.fetchval("SELECT count(*) FROM site") + if site_count: + return + + # add blacklist entries + values = [] + blacklist = _load_list(config['config_dir'], 'black') + for base_url in blacklist: + durl = await Durl(base_url) + if durl: + url = durl.site() + values.append((url, {'source': 'seed file'})) + sql = ( + "INSERT INTO site_annotation (base_url, ann_type, ann_content)" + " VALUES ($1, 'blacklist', $2)" + ) + await conn.executemany(sql, values) + + # add whitelist entries + values1 = [] + values2 = [] + whitelist = _load_list(config['config_dir'], 'white') + for base_url in whitelist: + durl = await Durl(base_url) + if durl: + url = durl.site() + if url not in blacklist: + values1.append((url, {'source': 'seed file'})) + values2.append((url,)) + sql = ( + "INSERT INTO site_annotation (base_url, ann_type, ann_content)" + " VALUES ($1, 'whitelist', $2)" + ) + await conn.executemany(sql, values1) + sql = "INSERT INTO site_queue (src, url) VALUES (null, $1)" + await conn.executemany(sql, values2) + + +def _load_list(config_dir, black_white): + """ + Load the seed black or white list. + """ + path = Path(config_dir) / 'initial_data' / f'seed_urls.list' + with open(path, 'r') as list_file: + urls = [] + for line in list_file.read().strip().splitlines(): + line_ = line.strip() + if line_.startswith('#'): + continue + if black_white == 'black' and line_.startswith('-'): + urls.append(line_[1:].strip()) + if black_white == 'white' and line_.startswith('+'): + urls.append(line_[1:].strip()) + return urls diff --git a/src/atextcrawler/tensorflow.py b/src/atextcrawler/tensorflow.py new file mode 100644 index 0000000..197572c --- /dev/null +++ b/src/atextcrawler/tensorflow.py @@ -0,0 +1,69 @@ +""" +Query the tensorflow_model_server's REST API. +""" + +import logging +from typing import Optional, Union + +import aiohttp + +logger = logging.getLogger(__name__) + + +class TensorFlow: + """ + Fetch an embedding vector from the tensorflow model server. 
+ """ + + def __init__( + self, + app, + session: aiohttp.ClientSession, + timeout_sock_connect: Union[int, float] = 0.5, + timeout_sock_read: Union[int, float] = 10, + ): + self.config = app.config['tensorflow'] + self.session = session + self.timeout = aiohttp.ClientTimeout( + sock_connect=timeout_sock_connect, sock_read=timeout_sock_read + ) + + async def embed( + self, text: Union[str, list[str]] + ) -> Optional[Union[list[float], list[list[float]]]]: + """ + Query the tensorflow_model_server's REST API for a prediction. + + Take a string or a list of strings and return an embedding vector + or a list of embedding vectors. + + If the request fails or times out, return None. + """ + text_ = text if isinstance(text, list) else [text] + data = {'signature_name': 'serving_default', 'instances': text_} + try: + async with self.session.post( + self.config['model_server_endpoint'], + json=data, + timeout=self.timeout, + ) as resp: + try: + res = await resp.json() + if isinstance(text, list): + return res.get('predictions') + else: + return res.get('predictions')[0] + except: + msg = 'Got invalid response from tensorflow' + logger.error(msg) + return None + except Exception as err: + msg = 'Could not get embedding from tensorflow for ' + if isinstance(text, str): + msg += f'string of length {len(text)}' + else: + msg += 'list of strings with lengths ' + msg += ','.join([str(len(s)) for s in text]) + msg += f', reason: {err}' + logger.error(msg) + return None diff --git a/src/atextcrawler/utils/__init__.py b/src/atextcrawler/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/atextcrawler/utils/annotation.py b/src/atextcrawler/utils/annotation.py new file mode 100644 index 0000000..24ca149 --- /dev/null +++ b/src/atextcrawler/utils/annotation.py @@ -0,0 +1,481 @@ +""" +Convert html to plain text with annotations over character ranges. +""" + +import re +from collections import defaultdict +from html.parser import HTMLParser + +from .json import json_dumps, json_loads +from .link import nofollow_link_rels +from .tag import keep_tags, self_closing_tags + +MAX_HREF_LENGTH = 200 +""" +Maximum length of an href. Other links are discarded. +""" + + +text_blacklist = [ + 'previous', + 'next', + 'back', # common pagination navigation + '↩︎', # amusewiki footnote separator (after conversion from muse to html) +] +""" +Texts to ignore. +""" + + +class AnnotatingParser(HTMLParser): + """ + Parse tagged text resulting in pure text and annotations. + + The text is available in self.text and the annotations + in self.annotations, which is a dict with these keys: + + * tags: contains a mapping of offset ranges (i, f) to + the tags opening at i and closing at f + * semantic_breaks: a mapping of offset positions where + a new section begins to the nesting level of that + sections; a section is whereever an (opening or closing) + separating tag is placed in the raw html; for the + separating flag of tags see tag.py + * links: a mapping of hrefs to link texts obtained from + anchor (a) tags; we skip hyperref with nofollow rels + * section_ids: map an offset position to the first + id attribute (of any tag) at the beginning of a + semantic section; this can later be used in a URL + fragment for linking directly into this section + + Internally, we put opening tags on self.stack and pop them + when the first matching closing tag is encountered. We assume + balanced tags (tidy html). 
+
+ NB: all tags with semantic breaks have sep=True, i.e.,
+ they will have spaces around them so that the semantic breaks
+ always sit on a space; the semantic break position p is the end
+ of the last section and the next section begins at p + 1.
+
+ The text always begins with a ' ' (added if not in the original),
+ which is assigned a semantic break with default level 80
+ (if there is no semantic break tag at the beginning).
+ """
+
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ self.text = ' ' # concatenated text data (without tags)
+ self.pos = 1 # equal to len(self.text)
+ self.stack = []
+ self.tags = defaultdict(dict)
+ self.semantic_breaks = {0: 80}
+ self.tag_id = None
+ self.section_ids = defaultdict(list)
+ self.links = {}
+ self.add_space = False
+
+ def close(self):
+ """
+ Finish by collecting results in dict `self.annotations`.
+ """
+ super().close()
+ self.annotations = {}
+ self.annotations['links'] = self.links
+ self.annotations['semantic_breaks'] = {
+ pos: lvl for pos, lvl in sorted(self.semantic_breaks.items())
+ }
+ self.annotations['tags'] = self.tags
+ self.annotations['section_ids'] = self.section_ids
+
+ def handle_starttag(self, tag, attrs):
+ """
+ Called for each opening tag.
+ """
+ sep, lvl, sem = keep_tags[tag]
+ attrs = dict(attrs)
+ if sep:
+ self.add_space = True
+ if tag == 'section' and 'endnotes' in attrs.get('role', ''):
+ lvl = 25
+ # ARIA roles
+ if role := attrs.get('role'):
+ if role == 'article':
+ lvl = 15
+ elif role == 'heading':
+ if aria_level := attrs.get('aria-level'):
+ # NB: attribute values are strings
+ if aria_level in ('1', '2', '3', '4', '5', '6'):
+ sep, lvl, sem = keep_tags[f'h{aria_level}']
+ elif role == 'region':
+ lvl = 24
+ i = self.pos
+ if tag in self_closing_tags:
+ # self-closing tags will not be added to the result tags,
+ # they only appear in semantic_breaks
+ # the two self-closing tags br and hr both have lvl and sep
+ if i == 1: # replace the default semantic break at pos 0
+ i = 0
+ self.add_semantic_break(i, lvl)
+ i += 1
+ if tag_id := attrs.get('id'):
+ self.tag_id = i, tag_id
+ self.add_tag_id(i) # br or hr may have an id, too
+ self.add_space = True
+ else:
+ self.stack.append((i, tag, sep, lvl, sem, attrs))
+ # forget outdated tag id at new semantic break
+ if lvl:
+ self.forget_tag_id()
+ # memorize tag id
+ if not self.tag_id and (tag_id := attrs.get('id')):
+ self.tag_id = self.pos, tag_id
+
+ def handle_endtag(self, tag):
+ """
+ Called for each closing tag.
+ """ + if not self.stack or (self.stack and self.stack[-1][1] != tag): + return # nothing to do for an already closed self-closing tag + i, tag_, sep, lvl, sem, attrs = self.stack.pop() + f = self.pos + # omit tag without content + if i == f: + return + # for a closing div tag revise lvl to minimum level of contained + # semantic breaks (if any) + if tag == 'div': + min_lvl = 101 + for pos_, lvl_ in reversed(self.semantic_breaks.items()): + if pos_ <= i: + break + min_lvl = min(min_lvl, lvl_) + if min_lvl < 101: + lvl = min_lvl + # add semantic break and an optional section_id + if lvl: + if i == 1: # replace the default semantic break at pos 0 + i = 0 + if tag in ('ul', 'ol', 'li'): + seen_tags = [x[1] for x in self.stack] + if 'p' not in seen_tags: + lvl = 52 + seen_tags.count('tag') + if tag == 'li': + lvl += 1 + self.add_semantic_break(i, lvl) + self.add_tag_id(i) + # do not include surrounding spaces in tag span + if self.text[i] == ' ': + i += 1 + # add tag + self.tags[(i, f)][tag] = sem + # add space (when handling next data) + if sep: + self.add_space = True + # collect links + if tag == 'a': + self.extract_link(i, attrs) + + def handle_data(self, text): + """ + Called for each non-tag content between tags. + """ + # handle empty or blacklisted text + if text == '': + return + if text == ' ': + self.add_space = True + return + if text.strip().lower() in text_blacklist: + if ' ' in text: + self.add_space = True + return + # add a space (at self.pos) if the text begins with one + # or if we shall add one + startswith_space = text.startswith(' ') + text = text.lstrip() + if startswith_space or self.add_space: + if self.text[-1] != ' ': + self.text += ' ' + self.pos += 1 + self.add_space = False + # strip a space at the end of text and handle it in end tag + if text.endswith(' '): + text = text[:-1] + self.add_space = True + # add text to self.text + self.text += text + self.pos += len(text) + + def add_semantic_break(self, pos, lvl): + """ + Add a semantic break of level *lvl* at position *pos*. + """ + if pos in self.semantic_breaks: + self.semantic_breaks[pos] = min(self.semantic_breaks[pos], lvl) + else: + self.semantic_breaks[pos] = lvl + + def forget_tag_id(self): + """ + Reset a tag id if it is too far behind in the text stream. + """ + if self.tag_id: + pos_, tag_id = self.tag_id + if pos_ + 200 < self.pos: + self.tag_id = None + + def add_tag_id(self, pos): + """ + Add and clear an id if the just closing section has none yet. + + *pos* is the start position of the current section, and the + position where the id will be added. + + Add an id only if we are not too far in the section's text already. + """ + if self.tag_id: + pos_, tag_id = self.tag_id + if pos_ < pos + 100 and pos not in self.section_ids: + self.section_ids[pos].append(tag_id.lower()) + self.tag_id = None + + def extract_link(self, i, attrs): + """ + Add a link covering character range (i, self.pos). + + From html *attrs* extract href and rel. + """ + if (href := attrs.get('href')) and not attrs.get('rel') == 'nofollow': + if href.startswith('#'): + return + if len(href) > MAX_HREF_LENGTH: + return + attrs.get('title', '') + if rel := attrs.get('rel'): + if set(rel) & nofollow_link_rels: + return + self.links[href] = i, self.pos, rel + + +def annotate(html): + """ + Split html text into plain text with annotations (from AnnotatingParser). 
+ """ + parser = AnnotatingParser() + parser.reset() + parser.feed(html) + parser.close() + return parser.text, parser.annotations + + +re_footnote = re.compile(r'^\s*\[\d+\]\s+') + + +def headline_probability(text, tags, lvl) -> float: + """ + Estimate the probability that the text with tags is a headline. + + The context is not considered: The question is not whether the + text is a headline for the following text. + """ + text = text.strip() + res = 0.0 + if not text: + return res + if lvl < 60: + return 1.0 + # if 'h1' in tags or 'h2' in tags or 'h3' in tags or\ + # 'h4' in tags or 'h5' in tags or 'h6' in tags or 'center' in tags: + # return 1.0 + if len(text) < 80: + res = 0.7 + else: + res = 0.7 - 0.7 * (len(text) - 80) / 200 + if 'p' in tags: + res -= 0.4 + if 'em' in tags: + res += 0.3 + if 'a' in tags: + res -= 0.1 + if text[-1] in '.:': + res -= 0.3 + res -= 0.1 * text.count(', ') + if re_footnote.match(text): + res -= 0.4 + return max(res, 0.0) + + +def get_tag_counts(tag_names, i, f, tags, text) -> tuple[int, float, float]: + """ + Return the info on the share of characters covered with one of the *tags*. + + Only consider the characters between i and f of string *text*. + + Return the number of tags that have an overlap in the specified region, + the tag density in the region (fraction of covered characters by all), + and the average number of covered chars per tag. + + NB: If more than one tag name is given, then the fractional share + may exceed 1. + """ + if i == f: + return 0, 0.0, 0.0 + tag_count = 0 + covered_chars = 0 + for (s_i, s_f), anns in tags.items(): + if overlap := range_overlap(i, f - 1, s_i, s_f - 1): + for ann in anns: + if ann in tag_names: + tag_count += 1 + covered_chars += overlap[1] - overlap[0] + all_chars = f - i + tag_density = covered_chars * 1.0 / all_chars + avg_text_len = covered_chars * 1.0 / tag_count if tag_count else 0 + return tag_count, tag_density, avg_text_len + + +def range_overlap(i1, f1, i2, f2): + """ + Return the overlap of both ranges (None if there is none). + """ + return None if f1 <= i2 or f2 <= i1 else (max(i1, i2), min(f1, f2)) + + +def annotations_remove_section(annotations, i, f): + """ + Remove section (i, f) from annotations and return result. 
+ """ + new_annotations = {} + d = f - i + if not d: + return annotations + + # relocate tags + new_tags = {} + for (t_i, t_f), anns in annotations['tags'].items(): + n_i, n_f = cut_range(i, f, d, t_i, t_f) + if n_i is not None: + new_tags[(n_i, n_f)] = anns + new_annotations['tags'] = new_tags + + # relocate links + new_links = {} + for href, (l_i, l_f, rel) in annotations['links'].items(): + n_i, n_f = cut_range(i, f, d, l_i, l_f) + if n_i is not None: + new_links[href] = n_i, n_f, rel + + # relocate semantic breaks and section_ids + semantic_breaks = annotations['semantic_breaks'] + section_ids = annotations['section_ids'] + new_semantic_breaks = {} + new_section_ids = {} + for pos in sorted(semantic_breaks.keys()): + level = semantic_breaks[pos] + if i <= pos and pos < f: + continue # discard + elif f <= pos: + new_semantic_breaks[pos - d] = level + if pos in section_ids: + new_section_ids[pos - d] = section_ids[pos] + else: + new_semantic_breaks[pos] = level + if pos in section_ids: + new_section_ids[pos] = section_ids[pos] + + # collect and return results + new_annotations['semantic_breaks'] = new_semantic_breaks + new_annotations['section_ids'] = new_section_ids + new_annotations['links'] = new_links + return new_annotations + + +def cut_range(i, f, d, t_i, t_f): + """ + Return the new coordinates of a text range (t_i,t_f) after cutting (i,f). + + If (t_i,t_f) is fully within (i,f), return None, None. + """ + if t_f < i: + return t_i, t_f + elif t_i < i <= t_f <= f: + return t_i, i + elif t_i < i and f <= t_f: + return t_i, t_f - d + elif i <= t_i and t_f <= f: + return None, None + elif i <= t_i <= f < t_f: + return i, t_f - d + else: # f < t_i + return t_i - d, t_f - d + + +def clean_annotations(annotations: dict) -> None: + """ + Remove void stuff from annotations. + """ + cleaned_tags = {} + for (i, f), anns in annotations['tags'].items(): + if f > i and anns: + cleaned_tags[(i, f)] = anns + annotations['tags'] = cleaned_tags + + +def pack_annotations(annotations): + """ + Pack annotations to a special JSON string, reducing their volume a little. + """ + return json_dumps( + { + 'tags': _pack_tags(annotations['tags']), + 'semantic_breaks': ','.join( + [ + f'{pos}:{level}' + for pos, level in annotations['semantic_breaks'].items() + ] + ), + 'section_ids': annotations['section_ids'], + 'links': annotations['links'], + } + ) + + +def _pack_tags(tags: dict) -> str: + """ + Utility function for packing tag information into a string. + """ + res = '' + for (i, f), anns in tags.items(): + if anns: + anns_ = ','.join([f'{tag}={sem}' for tag, sem in anns.items()]) + res += f'{i}-{f}:{anns_}\n' + return res + + +def unpack_annotations(json_text: str) -> dict: + """ + Unpack tag information from a string. 
+ """ + annotations = json_loads(json_text) + tags = {} + for line in annotations['tags'].split('\n'): + if line: + range_, anns_ = line.split(':') + i, f = range_.split('-') + i = int(i) + f = int(f) + anns = {} + if anns_: + for ann_ in anns_.split(','): + tag_, sem_ = ann_.split('=') + anns[tag_] = sem_ + tags[(i, f)] = anns + semantic_breaks = {} + for sb_ in annotations['semantic_breaks'].split(','): + pos_, lvl_ = sb_.split(':') + semantic_breaks[int(pos_)] = int(lvl_) + return { + 'tags': tags, + 'semantic_breaks': semantic_breaks, + 'section_ids': annotations['section_ids'], + 'links': annotations['links'], + } diff --git a/src/atextcrawler/utils/date_finder.py b/src/atextcrawler/utils/date_finder.py new file mode 100644 index 0000000..a787d2b --- /dev/null +++ b/src/atextcrawler/utils/date_finder.py @@ -0,0 +1,90 @@ +""" +Find date expressions in a string. +""" + +import re +from datetime import datetime +from typing import Optional + +p_day = r'(0?[1-9]|[12][0-9]|3[01])' +p_month = r'(0?[1-9]|1[0-2])' +p_year = r'(20\d\d|19\d\d)' +sep = r'\D{1,2}' +p_t = r'(\D{0,4}([01][0-9]|2[0-3]):([0-5][0-9]))?' + + +format_re = { + 'iso': ( + re.compile(f'(^|\\D){p_year}{sep}{p_month}{sep}{p_day}(\\D{p_t}|$)'), + (1, 2, 3, 6, 7), + ), + 'dmy': ( + re.compile(f'(^|\\D){p_day}{sep}{p_month}{sep}{p_year}(\\D{p_t}|$)'), + (3, 2, 1, 6, 7), + ), + 'mdy': ( + re.compile(f'(^|\\D){p_month}{sep}{p_day}{sep}{p_year}(\\D{p_t}|$)'), + (3, 1, 2, 6, 7), + ), +} + + +lang_format = { + 'de': ('iso', 'dmy'), + 'en': ('iso', 'mdy'), + None: ('iso', 'dmy', 'mdy'), +} + + +def extract_latest_date(text: str, lang: str = None) -> Optional[datetime]: + """ + Extract the latest date compatible with the *lang* from *text*. + + Only consider dates in the past. + """ + dates = extract_dates(text, lang=lang) + return max(dates) if dates else None + + +def extract_dates(text: str, lang: str = None) -> list[datetime]: + """ + Extract dates form a string, optionally limiting formats to a language. + """ + dates = [] + fmts = lang_format.get(lang, lang_format[None]) + for fmt in fmts: + re_, slots = format_re[fmt] + matches = re_.findall(text) + if matches: + for match in matches: + try: + date = datetime( + int(match[slots[0]]), + int(match[slots[1]]), + int(match[slots[2]]), + int(match[slots[3]] or 0), + int(match[slots[4]] or 0), + ) + if date <= datetime.utcnow(): + dates.append(date) + except: + pass + return dates + + +## from htmldate import find_date + +# def extract_last_pub(html): +# """ +# Return an estimate for the time of last content publication from html. +# """ +# # https://stackoverflow.com/questions/57833080/how-to-fix-unicode-strings-with-encoding-declaration-are-not-supported +# lxml_tree = lxml_html.fromstring(bytes(html, encoding='utf8')) +# # publication date (from startpage) +# try: +# date_string = find_date(lxml_tree) +# pd = date.fromisoformat(date_string) +# last_pub = datetime(pd.year, pd.month, pd.day, 12, 0, 0) +# except: +# last_pub = None +# return last_pub diff --git a/src/atextcrawler/utils/durl.py b/src/atextcrawler/utils/durl.py new file mode 100644 index 0000000..7837d68 --- /dev/null +++ b/src/atextcrawler/utils/durl.py @@ -0,0 +1,278 @@ +""" +Hyperlink parsing. 
+""" + +import logging +from typing import Optional +from urllib.parse import urlsplit + +import tldextract +from async_dns import types +from async_dns.resolver import ProxyResolver +from async_lru import alru_cache + +from .link import in_blacklist + +logger = logging.getLogger(__name__) + + +resolver = ProxyResolver(request_timeout=2) + + +async_dns_logger = logging.getLogger('async_dns') +async_dns_logger.setLevel(logging.WARNING) + + +extract = tldextract.TLDExtract(cache_dir=False) + + +# tldextract uses filelock; set its loglevel to warning +filelock_logger = logging.getLogger('filelock') +filelock_logger.setLevel(logging.WARNING) + + +class Durl: + """ + Decomposed URL, contains :class:`urllib.parse.SplitResult`. + + When constructing this class, it has to be awaited, e.g.: + + my_durl = await Durl('http://www.example.com/whatever') + + The given URL will be decomposed, validated and normalized. + If the URL is invalid, we return None instead of an instance. + + If the given *base* is None, the URL must be absolute and + the hostname must be valid (DNS lookup). + + If the given URL is not absolute, an already decomposed (and thus + valid) *base* Durl must be given; otherwise the URL is invalid. + + The *base* Durl can contain a path (but no arguments or fragments), + in which case the URL - if not absolute - must begin with this path. + + The scheme must be http or https. If the URL begins with '//', + 'http:' is prepended. + + If the hostname is longer than 90 characters, the URL is invalid. + + Default port numbers (80 for http, 443 for https) are removed. + + The hostname is changed to lower case. Spaces in the hostname + make the URL invalid. + + URL fragments are removed. + """ + + _url = None + _base = None + _match_base = False + + def __init__( + self, + url: str, + base: Optional['Durl'] = None, + match_base: bool = False, + ): + self._url = url + self._base = base + self._match_base = match_base + + def __await__(self): + return self.__ainit__().__await__() + + async def __ainit__(self): + res = None + try: + # add missing scheme for urls beginning with '//' + if self._url.startswith('//'): + self._url = 'http:' + self._url + # split the url + durl = urlsplit(self._url) + # remove default port numbers 80, 443 + netloc = durl.netloc + if durl.port == 80 and durl.scheme == 'http': + netloc = netloc.removesuffix(str(durl.port)).rstrip(':') + if durl.port == 443 and durl.scheme == 'https': + netloc = netloc.removesuffix(str(durl.port)).rstrip(':') + if durl.hostname and durl.hostname != durl.netloc.lower(): + user_pass = '' + if durl.username and durl.password: + user_pass = f'{durl.username}:{durl.password}@' + port = '' + if durl.port: + port = f':{durl.port}' + netloc = f'{user_pass}{durl.hostname.lower()}{port}' + durl = durl._replace(netloc=netloc) + + if self._base: + # if missing fill in scheme and netloc from base + if not durl.scheme: + durl = durl._replace(scheme=self._base.scheme) + if not durl.netloc: + durl = durl._replace(netloc=self._base.netloc) + # if match_base, then set res only if the + # url is compatible with base url + if not self._match_base: + res = durl + else: + if durl.netloc == self._base.netloc: + if durl.scheme == self._base.scheme: + if self._base.path not in ('/', ''): + if durl.path.startswith(self._base.path): + res = durl + else: + res = durl + else: + res = durl + except: + logger.exception( + f'Durl init failed url={self._url}' + f' base={self._base} match_base={self._match_base}' + ) + res = None + if res: + res = 
res._replace(fragment='')
+ if not res.hostname or len(res.hostname) > 90:
+ res = None
+ elif res.scheme not in ('https', 'http'):
+ res = None
+ elif ' ' in res.hostname or '.' not in res.hostname:
+ res = None
+ elif not (await get_ips(res.hostname)):
+ res = None
+ elif not res.path.startswith('/'):
+ res = res._replace(path='/')
+ if res:
+ self._durl = res
+ return self
+ self._durl = None
+
+ def __getattr__(self, attr):
+ return getattr(self._durl, attr)
+
+ def url(self) -> str:
+ """
+ Return the URL as string.
+ """
+ return self._durl.geturl()
+
+ def pwa(self) -> str:
+ """
+ Return the (base-relative) path with args of the Durl.
+ """
+ if self._base and self._match_base:
+ path = self._durl.path.removeprefix(self._base.path)
+ else:
+ path = self._durl.path
+ qs = f'?{self._durl.query}' if self._durl.query else ''
+ return f'{path}{qs}'.lstrip('/')
+
+ def has_path(self) -> bool:
+ """
+ Return whether the Durl has a non-trivial path.
+ """
+ return self._durl.path not in ('/', '')
+
+ def site(self) -> str:
+ """
+ Return the site (base_url).
+ """
+ return f'{self._durl.scheme}://{self._durl.netloc}/'
+
+ def domain(self) -> str:
+ """
+ Return the domain of the Durl (wrong in case of second-level domains).
+ """
+ levels = extract(self._durl.hostname)
+ return '.'.join(levels[-2:]).lower()
+
+ def replace_scheme(self, scheme: str) -> None:
+ """
+ Replace the scheme (must be 'http' or 'https').
+ """
+ self._durl = self._durl._replace(scheme=scheme)
+
+
+@alru_cache(maxsize=1000)
+async def get_ips(hostname: str) -> set[str]:
+ """
+ Return IPv4 and IPv6 addresses of the given hostname.
+ """
+ ips = set()
+ for type_ in (types.A, types.AAAA):
+ try:
+ res, cached = await resolver.query(hostname, type_)
+ if res:
+ if addr := res.get_record([type_]):
+ ips.add(addr.data)
+ except:
+ pass
+ return ips
+
+
+def get_url_variants(url: str) -> list[str]:
+ """
+ Return variants of the URL.
+
+ Replace http with https and vice versa;
+ prepend or remove 'www.' to or from the beginning of the hostname.
+ """
+ if url.startswith('http://www.'):
+ s = url.removeprefix('http://www.')
+ return [url, f'http://{s}', f'https://www.{s}', f'https://{s}']
+ elif url.startswith('http://'):
+ s = url.removeprefix('http://')
+ return [url, f'http://www.{s}', f'https://www.{s}', f'https://{s}']
+ elif url.startswith('https://www.'):
+ s = url.removeprefix('https://www.')
+ return [url, f'https://{s}', f'http://www.{s}', f'http://{s}']
+ elif url.startswith('https://'):
+ s = url.removeprefix('https://')
+ return [url, f'https://www.{s}', f'http://www.{s}', f'http://{s}']
+ else:
+ return [url]
+
+
+async def assort_links(
+ links: dict[str, tuple[int, int, list[str]]],
+ durl: Durl,
+ text: str,
+ base_url: str = None,
+) -> tuple[
+ dict[str, tuple[int, int, list[str]]],
+ dict[Durl, tuple[list[str], str]],
+ dict[Durl, tuple[list[str], str]],
+]:
+ """
+ Sort links into a cleaned, an internal and an external dict.
+
+ The cleaned dict maps absolute URLs to char ranges and relations.
+ The internal dict maps absolute URLs to relations and the linked text.
+ The external dict maps absolute URLs to relations and the linked text.
+ The relations are link relations, e.g. rel="canonical".
+
+ If *base_url* is set, it is used to distinguish internal and external
+ links. If it is not set, the base_url is obtained from *durl*.
+ """ + res_int = {} + res_ext = {} + if not base_url: + base_url = durl.site().lower() + base_durl = await Durl(base_url) + cleaned_links = {} + for href, (i, f, rel) in links.items(): + durl = await Durl(href, base=base_durl) + if not durl: + continue + if durl.hostname and in_blacklist(durl.hostname): + continue + cleaned_links[durl.url()] = i, f, rel + txt = text[i:f] + if durl.site().lower() == base_url: + res_int[durl] = rel, txt + else: + res_ext[durl] = rel, txt + return cleaned_links, res_int, res_ext diff --git a/src/atextcrawler/utils/html.py b/src/atextcrawler/utils/html.py new file mode 100644 index 0000000..0cfd800 --- /dev/null +++ b/src/atextcrawler/utils/html.py @@ -0,0 +1,136 @@ +""" +Utilities for extracting information from html. +""" + +import re +from html import unescape +from typing import Optional + +from bs4 import BeautifulSoup + +from .lang import clean_lang +from .tag import drop_roles, drop_tags, keep_tags + +re_ = { + 'html_lang': re.compile( + ']*lang\s*=\s*["\']([^"\']*)["\'][^>]*>', re.I | re.S + ), + 'title': re.compile(']*>([^<]*)', re.I | re.S), + 'strip': re.compile( + '<(?!/?(' + '|'.join(keep_tags.keys()) + ')[ >])[^>]+>', re.I | re.S + ), + 'empty_tag': re.compile(r'<(?P\w+)( [^>]*)?>(\s*)', re.S), + 'whitespace': re.compile('(\s| )+', re.S), + 'whitespace_': re.compile('\s| ?'), # allow broken   + 'whitespace_near_tag': re.compile( + '\s*<(br|p|/p|ul|/ul|li|/li|h1|/h1' + '|h2|/h2|h3|/h3|h4|/h4|h5|/h5|h6|/h6)>\s*', + re.S, + ), + 'whitespace_tag_tag': re.compile('(\s+)((<[^>]+>\s+)+)', re.S), + 'whitespace_tag_tag_func': re.compile('(<[^>]+>)\s+', re.S), + 'http_equiv': re.compile('(]*http-equiv[^>]*>)', re.I | re.S), +} + + +def whitespace_tag_tag(match_obj): + """ + Helper function for removing whitespace between tags. + """ + return ' ' + re_['whitespace_tag_tag_func'].sub(r'\1', match_obj.group(2)) + + +def clean_html(s: Optional[str]) -> Optional[str]: + """ + Clean an html string. + + Unescape htmlentities and replace whitespaces with ' ' (ASCII char 0x20). + + See also: https://www.lesinskis.com/python-unicode-whitespace.html + """ + return re_['whitespace_'].sub(' ', unescape(s)).strip() if s else None + + +def get_html_lang(html: str) -> Optional[str]: + """ + Return the language, if any, found in the lang attribute of the html tag. + """ + m = re_['html_lang'].search(html) + return clean_lang(m.group(1)) if m else None + + +def extract_title(html: str) -> Optional[str]: + """ + Extract title tags from html returning their content as a string. + """ + if not (titles := re_['title'].findall(html)): + return None + titles = [clean_html(title) for title in reversed(titles) if title] + return ' - '.join(titles).strip(' |') + + +def clean_page(html): + """ + Remove unwanted tags including their content from html. + + Drop tags in *drop_tags* as well as tags with a role in *drop_roles*. + Also drop tags with attribute aria-hidden=true. + + Return a beautiful soup. + """ + soup = BeautifulSoup(html, 'html.parser') + for tag in drop_tags: + for n in soup.find_all(tag): + n.decompose() + for n in soup.find_all(attrs={'aria-hidden': 'true'}): + n.decompose() + for role in drop_roles: + for n in soup.find_all(attrs={'rel': role}): + n.decompose() + return soup + + +def clean_body(body): + """ + Clean an html body. + + Remove unwanted tags (keeping their content); remove empty tags; + remove and replace whitespaces in several ways. + + In the end the only whitespace is a space and there are no + consecutive spaces. 
+ """ + body = re_['strip'].sub(' ', body) + body = re_['whitespace_near_tag'].sub(r'<\1>', body) + body = re_['whitespace'].sub(' ', body) + while re_['empty_tag'].search(body): + body = re_['empty_tag'].sub(r'\3', body) + body = re_['whitespace_near_tag'].sub(r'<\1>', body) + body = re_['whitespace'].sub(' ', body) + body = re_['whitespace_tag_tag'].sub(whitespace_tag_tag, body) + return body.strip().replace('\u00ad', '') # soft hyphen + + +def get_html_redirect(html: str) -> Optional[str]: + """ + Return an html redirect in an http-equiv meta tag. + + If none is found, return None. + """ + redir_url = None + http_equivs = re_['http_equiv'].findall(html) + for raw in http_equivs: + tag = BeautifulSoup(raw, 'html.parser').meta + if tag and tag.get('http-equiv', '').lower() == 'refresh': + if content := tag.get('content'): + try: + _, redir_url = content.split(';') + redir_url = ( + redir_url.strip() + .removeprefix('url=') + .removeprefix('URL=') + .strip("'") + ) + except: + pass + return redir_url diff --git a/src/atextcrawler/utils/http.py b/src/atextcrawler/utils/http.py new file mode 100644 index 0000000..a6bf4c0 --- /dev/null +++ b/src/atextcrawler/utils/http.py @@ -0,0 +1,58 @@ +""" +Utility functions related to http. +""" + +import re +from typing import Optional + +from multidict import CIMultiDictProxy + +from ..models import Site +from .durl import Durl + +re_ = { + 'link_header': re.compile(',\s*(?=<)'), + 'rel_canonical': re.compile(';\s*rel\s*=\s*["\']?canonical', re.I), + 'rel_shortlink': re.compile(';\s*rel\s*=\s*["\']?shortlink', re.I), +} + + +async def get_header_links( + headers: CIMultiDictProxy, + durl: Durl, + site: Optional[Site], +) -> dict[str, Optional[str]]: + """ + Extract canonical and shortlink links from http headers. + + *durl* must be the Durl of the fetched page and *site* - i fnon None - + must be the Site to which the page belongs. + + Return a (default)dict with 'canonical' and 'shortlink' as keys. + The values default to None. + """ + res = {} + canonical = shortlink = None + if 'link' in headers and (link_headers := headers.getall('link')): + links = [] + for link_header in link_headers: + links += re_['link_header'].split(link_header) + url = durl.url() + base_url = site.base_url if site else url + base_durl = await Durl(base_url) if base_url else None + for link in links: + if not canonical and 'canonical' in link.lower(): + if re_['rel_canonical'].search(link): + canon_url = link.strip().lstrip('<').split('>')[0] + if canon_durl := await Durl(canon_url, base=base_durl): + canonical = canon_durl.url() + if not shortlink and 'shortlink' in link.lower(): + if re_['rel_shortlink'].search(link): + short_url = link.strip().lstrip('<').split('>')[0] + if short_durl := await Durl(short_url, base=base_durl): + shortlink = short_durl.url() + if canonical and shortlink: + break + res['canonical'] = canonical + res['shortlink'] = shortlink + return res diff --git a/src/atextcrawler/utils/json.py b/src/atextcrawler/utils/json.py new file mode 100644 index 0000000..874419f --- /dev/null +++ b/src/atextcrawler/utils/json.py @@ -0,0 +1,32 @@ +""" +Custom JSON encoder. +""" + +import json + + +class JSONEncoderExt(json.JSONEncoder): + """ + Extended JSON encoder with encoding of sets as lists. + """ + + def default(self, obj): + """ + Encode sets as lists and everything else as by default. 
+ """ + if isinstance(obj, set): + return list(obj) + return json.JSONEncoder.default(self, obj) + + +def json_dumps(obj): + """ + Encode an object to a JSON string using JSONEncoderExt. + """ + return json.dumps(obj, cls=JSONEncoderExt) + + +json_loads = json.loads +""" +Decoding of JSON strings as by default. +""" diff --git a/src/atextcrawler/utils/lang.py b/src/atextcrawler/utils/lang.py new file mode 100644 index 0000000..72357c6 --- /dev/null +++ b/src/atextcrawler/utils/lang.py @@ -0,0 +1,44 @@ +""" +Utility functions related to languages. +""" + +from pathlib import Path +from typing import Optional + +import gcld3 + +asset_path = Path(__file__).parent.parent / 'assets' + + +with open(asset_path / 'iso_639-1', 'r') as f: + iso_639_1_codes = f.read().strip().split('\n') + + +lang_detector = gcld3.NNetLanguageIdentifier( + min_num_bytes=0, max_num_bytes=1000 +) + + +def clean_lang(lang: Optional[str]) -> Optional[str]: + """ + Clean a language code string: it must be an ISO 639-1 code or None. + """ + if lang is None: + return None + lang = lang[:2].lower() + if lang in iso_639_1_codes: + return lang + return None + + +def extract_content_language(text: str) -> Optional[str]: + """ + Extract the language from a text. + """ + if len(text) < 10: + return None + lang = None + lang_det = lang_detector.FindLanguage(text=text) + if lang_det.is_reliable: + lang = lang_det.language[:2] + return lang diff --git a/src/atextcrawler/utils/link.py b/src/atextcrawler/utils/link.py new file mode 100644 index 0000000..e3875e7 --- /dev/null +++ b/src/atextcrawler/utils/link.py @@ -0,0 +1,116 @@ +""" +Hyperlinks (a href, link). +""" + +from pathlib import Path +from typing import Optional + +import tldextract + +nofollow_link_rels = set( + [ + 'nofollow', + 'search', + 'noreferrer', + 'noopener', + 'help', + 'license', + ] +) +""" +Do not follow the hrefs in anchor tags with these values of the rel attribute. +""" + + +meta_names = ( + 'generator', + 'lang', + 'language', + 'description', + 'keywords', + 'author', + 'title', + 'subject', + 'revised', + 'abstract', + 'topic', + 'summary', + 'classfication', + 'category', + 'reply-to', + 'owner', + 'url', + 'identifier-URL', + 'geo.position', + 'geo.region', + 'geo.placename', + 'dc.language', +) +""" +Values of the name attribute of meta tags to keep. + +See also: https://gist.github.com/lancejpollard/1978404 +See also: https://github.com/joshbuchea/HEAD +""" + + +meta_props = ( + 'og:site_name', + 'og:locale', + 'og:type', + 'og:latitude', + 'og:longitude', + 'og:street', + 'og:locality', + 'og:region', + 'og:postal', + 'og:country', +) +""" +Values of the property attribute of meta tags to keep. +""" + + +link_rels = set( + [ + 'webmention', + 'pingback', + 'alternate', + 'canonical', + 'author', + ] +) +""" +Values of the rel attribute of link tags to keep. +""" + + +def load_blacklist(): + """ + Return the 10000 most popular internet domains. + """ + path = Path(__file__).parent.parent / 'assets' / 'top_1e4' + with open(path, 'r') as file: + domains = file.read().strip().splitlines() + return domains + + +domain_blacklist = load_blacklist() + + +def in_blacklist(hostname: str) -> Optional[str]: + """ + Return a match of host in the blacklist, or None. + """ + domain = extract_domain(hostname) + if domain in domain_blacklist: + return hostname + return None + + +def extract_domain(hostname: str) -> str: + """ + Extract the lower-case domain from a hostname. 
diff --git a/src/atextcrawler/utils/lang.py b/src/atextcrawler/utils/lang.py
new file mode 100644
index 0000000..72357c6
--- /dev/null
+++ b/src/atextcrawler/utils/lang.py
@@ -0,0 +1,44 @@
+"""
+Utility functions related to languages.
+"""
+
+from pathlib import Path
+from typing import Optional
+
+import gcld3
+
+asset_path = Path(__file__).parent.parent / 'assets'
+
+
+with open(asset_path / 'iso_639-1', 'r') as f:
+    iso_639_1_codes = f.read().strip().split('\n')
+
+
+lang_detector = gcld3.NNetLanguageIdentifier(
+    min_num_bytes=0, max_num_bytes=1000
+)
+
+
+def clean_lang(lang: Optional[str]) -> Optional[str]:
+    """
+    Normalize a language code string, returning an ISO 639-1 code or None.
+    """
+    if lang is None:
+        return None
+    lang = lang[:2].lower()
+    if lang in iso_639_1_codes:
+        return lang
+    return None
+
+
+def extract_content_language(text: str) -> Optional[str]:
+    """
+    Detect the language of a text, returning an ISO 639-1 code or None.
+
+    Texts shorter than 10 characters are not classified.
+    """
+    if len(text) < 10:
+        return None
+    lang = None
+    lang_det = lang_detector.FindLanguage(text=text)
+    if lang_det.is_reliable:
+        lang = lang_det.language[:2]
+    return lang
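Illustrative behavior, assuming 'en' is contained in the bundled iso_639-1 asset and 'xx' is not:

>>> from atextcrawler.utils.lang import clean_lang
>>> clean_lang('EN-us')
'en'
>>> clean_lang('xx') is None
True
>>> clean_lang(None) is None
True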
diff --git a/src/atextcrawler/utils/link.py b/src/atextcrawler/utils/link.py
new file mode 100644
index 0000000..e3875e7
--- /dev/null
+++ b/src/atextcrawler/utils/link.py
@@ -0,0 +1,116 @@
+"""
+Hyperlinks (a href, link).
+"""
+
+from pathlib import Path
+from typing import Optional
+
+import tldextract
+
+nofollow_link_rels = set(
+    [
+        'nofollow',
+        'search',
+        'noreferrer',
+        'noopener',
+        'help',
+        'license',
+    ]
+)
+"""
+Do not follow the hrefs in anchor tags with these values of the rel attribute.
+"""
+
+
+meta_names = (
+    'generator',
+    'lang',
+    'language',
+    'description',
+    'keywords',
+    'author',
+    'title',
+    'subject',
+    'revised',
+    'abstract',
+    'topic',
+    'summary',
+    'classification',
+    'category',
+    'reply-to',
+    'owner',
+    'url',
+    'identifier-URL',
+    'geo.position',
+    'geo.region',
+    'geo.placename',
+    'dc.language',
+)
+"""
+Values of the name attribute of meta tags to keep.
+
+See also: https://gist.github.com/lancejpollard/1978404
+See also: https://github.com/joshbuchea/HEAD
+"""
+
+
+meta_props = (
+    'og:site_name',
+    'og:locale',
+    'og:type',
+    'og:latitude',
+    'og:longitude',
+    'og:street',
+    'og:locality',
+    'og:region',
+    'og:postal',
+    'og:country',
+)
+"""
+Values of the property attribute of meta tags to keep.
+"""
+
+
+link_rels = set(
+    [
+        'webmention',
+        'pingback',
+        'alternate',
+        'canonical',
+        'author',
+    ]
+)
+"""
+Values of the rel attribute of link tags to keep.
+"""
+
+
+def load_blacklist():
+    """
+    Return the 10000 most popular internet domains.
+    """
+    path = Path(__file__).parent.parent / 'assets' / 'top_1e4'
+    with open(path, 'r') as file:
+        domains = file.read().strip().splitlines()
+    return domains
+
+
+domain_blacklist = load_blacklist()
+
+
+def in_blacklist(hostname: str) -> Optional[str]:
+    """
+    Return the hostname if its domain is in the blacklist, else None.
+    """
+    domain = extract_domain(hostname)
+    if domain in domain_blacklist:
+        return hostname
+    return None
+
+
+def extract_domain(hostname: str) -> str:
+    """
+    Extract the lower-case registrable domain from a hostname.
+    """
+    levels = tldextract.extract(hostname)
+    return '.'.join(levels[-2:]).lower()
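A short sketch of both helpers; the second result assumes google.com appears in the bundled top_1e4 list (and note that tldextract may fetch the public suffix list on first use):

>>> from atextcrawler.utils.link import extract_domain, in_blacklist
>>> extract_domain('www.Example.co.uk')
'example.co.uk'
>>> in_blacklist('mail.google.com')
'mail.google.com'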
diff --git a/src/atextcrawler/utils/muse.py b/src/atextcrawler/utils/muse.py
new file mode 100644
index 0000000..467122e
--- /dev/null
+++ b/src/atextcrawler/utils/muse.py
@@ -0,0 +1,120 @@
+"""
+Parse muse-formatted plaintext (delivered by amusewiki).
+"""
+
+import re
+from datetime import datetime
+from typing import Optional
+
+from .date_finder import extract_latest_date
+from .lang import clean_lang
+
+re_tag = re.compile(r'<[^<]+?>')
+
+
+def parse_muse(text: str) -> Optional[tuple[dict, str]]:
+    """
+    Parse a MUSE string, returning meta information and the text body.
+    """
+    head, body = split_head_body(text)
+    if not head:
+        return None
+    meta = parse_head(head)
+    if not meta:
+        return None
+    return extract_muse_meta(meta, body), body
+
+
+def split_head_body(text: str) -> tuple[str, str]:
+    """
+    Split a MUSE string into head and body and return both.
+    """
+    head = ''
+    while text.startswith('#'):
+        line_end = (text.find('\n') + 1) or len(text)
+        head += text[:line_end]
+        text = text[line_end:]
+    return head.strip(), text.strip()
+
+
+def parse_head(text: str) -> dict:
+    """
+    Parse a MUSE head and return a dict mapping field names to values.
+    """
+    fields = {}
+    for line in text.split('\n'):
+        parts = line.strip().split(' ', 1)
+        if len(parts) == 2:
+            fields[parts[0][1:]] = parts[1]
+    return fields
+
+
+amusewiki_fields = [
+    'author',
+    'title',
+    'lang',
+    'LISTtitle',  # reduced title for alphabetical sorting
+    'subtitle',
+    'SORTauthors',  # authors separated by ';' or ',' (only for indexing)
+    'SORTtopics',  # topics separated by ';' or ',' (only for indexing)
+    'date',  # publication year
+    'pubdate',  # publication datetime
+    'notes',  # additional info (orig title, translators, credits, ...)
+    'source',  # preferred format: "Retrieved on March 8, 2012 from {URL}"
+    'publisher',
+    'isbn',
+    #'rights',
+    'seriesname',
+    'seriesnumber',
+    #'hyphenation',  # irrelevant
+    #'slides',  # irrelevant
+    #'DELETED',  # irrelevant
+    #'cover',  # irrelevant
+    #'coverwidth',  # irrelevant
+    #'nocoverpage',  # irrelevant
+    #'notoc',  # irrelevant
+    #'nofinalpage',  # irrelevant
+    #'impressum',  # irrelevant
+    #'continuefootnotes',  # irrelevant
+    #'centerchapter',  # irrelevant
+    #'centersection',  # irrelevant
+]
+"""
+Fields defined by amusewiki (cf. https://amusewiki.org/library/manual).
+"""
+
+
+re_list = re.compile('[;,]')
+
+
+def extract_muse_meta(meta, body) -> dict:
+    """
+    Extract meta information from the muse header and muse body.
+    """
+    authors = set()
+    if author := meta.get('author', '').strip():
+        authors.add(author)
+    if sortauthors := meta.get('SORTauthors', '').strip():
+        for author in re_list.split(sortauthors):
+            if author_ := author.strip():
+                authors.add(author_)
+    pubdate = meta.get('pubdate', '').strip()
+    pub_date: Optional[datetime] = None
+    if pubdate:
+        try:
+            pub_date = datetime.fromisoformat(pubdate)
+        except ValueError:
+            pub_date = extract_latest_date(pubdate)
+    summary = re_tag.sub('', body[:1000].split('\n\n')[0])
+    return {
+        'title': re_tag.sub('', meta.get('title', '')) or None,
+        'authors': authors,
+        'lang': clean_lang(meta.get('lang')),
+        'keywords': [
+            s.strip()
+            for s in re_list.split(meta.get('SORTtopics', '').strip())
+            if s.strip()
+        ],
+        'pub_date': pub_date,
+        'summary': summary,
+        'orig_source': meta.get('source', '').strip() or None,
+    }
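A minimal sketch of the head/body splitting and head parsing on a made-up MUSE snippet:

>>> from atextcrawler.utils.muse import split_head_body, parse_head
>>> text = '#title Walden\n#lang en\n\nBody text.'
>>> head, body = split_head_body(text)
>>> parse_head(head)
{'title': 'Walden', 'lang': 'en'}
>>> body
'Body text.'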
diff --git a/src/atextcrawler/utils/probe.py b/src/atextcrawler/utils/probe.py
new file mode 100644
index 0000000..0987433
--- /dev/null
+++ b/src/atextcrawler/utils/probe.py
@@ -0,0 +1,22 @@
+"""
+Utility functions for probing / sampling.
+"""
+
+
+def extract_samples(items, n=5):
+    """
+    Extract up to n sample elements from the given dict or list.
+
+    If *items* is a dict, sample from its keys.
+    """
+    length = len(items)
+    if length <= n:
+        return items
+    poss = []
+    step = (length + 1) / n
+    for i in range(n):
+        pos = int(step * i)
+        if pos < length and (not poss or pos > poss[-1]):
+            poss.append(pos)
+    items_list = list(items)
+    return [items_list[pos] for pos in poss]
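For illustration, extract_samples picks roughly evenly spaced elements and returns short inputs unchanged:

>>> from atextcrawler.utils.probe import extract_samples
>>> extract_samples(list(range(12)))
[0, 2, 5, 7, 10]
>>> extract_samples(['a', 'b', 'c'])
['a', 'b', 'c']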
diff --git a/src/atextcrawler/utils/section.py b/src/atextcrawler/utils/section.py
new file mode 100644
index 0000000..302f60f
--- /dev/null
+++ b/src/atextcrawler/utils/section.py
@@ -0,0 +1,74 @@
+"""
+Operations on text sections.
+
+Semantic breaks are character positions within a text (0-offset)
+where a new section begins. More precisely, the character at such
+a position is a space, and a semantically breaking tag (e.g., an h1
+or a br) begins at the next position.
+
+Each semantic break has a level, which expresses its breaking
+strength. The lower the level (e.g., h1 has a lower level than h2),
+the stronger the break.
+
+Implicitly, if position 0 has no semantic break, a semantic break
+at position 0 with level 80 is added.
+
+Semantic breaks can be used to split a text into sections.
+The lower the maximum level of the semantic breaks taken into account,
+the coarser the segmentation and the fewer the sections.
+Each section is given the level of the semantic break at its beginning.
+
+From another point of view, sections have levels indicating
+the segmentation depth.
+
+The levels for html tags are defined in tag.py.
+
+The *semantic_breaks* argument in the functions below
+is a dict mapping the character position of a semantic break
+to the level of the section beginning at that position
+(if segmentation is done at this or a higher level).
+"""
+
+
+def iter_sections(text, semantic_breaks, max_level=59):
+    """
+    Iterate over sections, splitting only at breaks up to *max_level*.
+
+    Yield (start_pos, end_pos, level, text).
+    *text* is assumed to have the first semantic break at position 0.
+    """
+    n = len(text)
+    last_pos = 0
+    last_level = semantic_breaks.get(0, 80)
+    for pos, level in sorted(semantic_breaks.items()):
+        if level <= max_level and last_pos != pos:
+            yield last_pos, pos, last_level, text[last_pos + 1 : pos]
+            last_pos = pos
+            last_level = level
+    if last_pos < n:
+        yield last_pos, n, last_level, text[last_pos:]
+
+
+def concat_section_texts(text, semantic_breaks, min_len=2000):
+    """
+    Concatenate consecutive sections into chunks with a minimum length.
+
+    Yield (section_ids, combined_text).
+    """
+    n = len(text)
+    last_pos = 0
+    section_ids = []
+    for section_id, pos in enumerate(sorted(semantic_breaks.keys())):
+        if pos >= last_pos + min_len:
+            if n - pos < min_len:
+                for id_ in [
+                    i
+                    for i, k in enumerate(sorted(semantic_breaks.keys()))
+                    if k >= pos
+                ]:
+                    section_ids.append(id_)
+                pos = n
+            yield section_ids, text[last_pos:pos]
+            last_pos = pos
+            section_ids = []
+        section_ids.append(section_id)
+    if last_pos < n:
+        yield section_ids, text[last_pos:]
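A small worked example: with breaks at positions 0 (level 30) and 8 (level 60), segmenting at max_level=60 yields two sections; note that a section's text starts one character behind its break position (the space), except for the trailing section:

>>> from atextcrawler.utils.section import iter_sections
>>> text = ' Heading text of the section'
>>> list(iter_sections(text, {0: 30, 8: 60}, max_level=60))
[(0, 8, 30, 'Heading'), (8, 28, 60, ' text of the section')]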
diff --git a/src/atextcrawler/utils/similarity.py b/src/atextcrawler/utils/similarity.py
new file mode 100644
index 0000000..b739056
--- /dev/null
+++ b/src/atextcrawler/utils/similarity.py
@@ -0,0 +1,92 @@
+"""
+Text similarity with simhash.
+"""
+
+import logging
+
+from asyncpg import Connection
+from simhash import Simhash, SimhashIndex
+
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.ERROR)
+
+
+postgresql_bigint_offset = 9223372036854775808
+"""
+Subtract this offset to map an unsigned 64-bit integer into PostgreSQL's
+bigint range.
+"""
+
+
+def get_features(txt: str) -> list[str]:
+    """
+    Extract features from a string for use with Simhash.
+    """
+    width = 3
+    txt = txt.replace(' ', '').lower()
+    return [txt[i : i + width] for i in range(max(len(txt) - width + 1, 1))]
+
+
+def simhash_to_bigint(simhash: Simhash) -> int:
+    """
+    Convert a simhash to PostgreSQL's bigint value range.
+    """
+    return simhash.value - postgresql_bigint_offset
+
+
+def simhash_from_bigint(bigint: int) -> Simhash:
+    """
+    Convert a simhash from PostgreSQL's bigint to a Simhash instance.
+    """
+    return Simhash(bigint + postgresql_bigint_offset, log=logger)
+
+
+def get_simhash(text: str) -> Simhash:
+    """
+    Return the Simhash of the given text.
+    """
+    return Simhash(get_features(text), log=logger)
+
+
+async def get_simhash_index(conn: Connection, site_id: int) -> SimhashIndex:
+    """
+    Return a simhash index with hashes of all stored resources of the site.
+    """
+    sql = (
+        "SELECT r.id, r.simhash FROM site_path sp, resource r"
+        " WHERE sp.site_id=$1 AND sp.resource_id=r.id"
+    )
+    rows = await conn.fetch(sql, site_id)
+    objs = [
+        (
+            str(row['id']),
+            Simhash(row['simhash'] + postgresql_bigint_offset, log=logger),
+        )
+        for row in rows
+    ]
+    return SimhashIndex(objs, k=3, log=logger)
+
+
+def create_simhash(
+    index: SimhashIndex,
+    resource_id: int,
+    simhash_instance: Simhash,
+) -> int:
+    """
+    Add a resource with given id and simhash to a simhash index.
+
+    Return the simhash value shifted into PostgreSQL's bigint range.
+
+    (The simhash field of the resource's database entry is not updated.)
+    """
+    index.add(str(resource_id), simhash_instance)
+    return simhash_to_bigint(simhash_instance)
+
+
+def search_simhash(index: SimhashIndex, simhash_inst: Simhash) -> list[int]:
+    """
+    Return the ids of similar resources from the index.
+    """
+    found = index.get_near_dups(simhash_inst)
+    if found:
+        return sorted([int(elem) for elem in found])
+    return []
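Since the bigint mapping is a plain offset shift, it round-trips; a sketch:

>>> from atextcrawler.utils.similarity import (
...     get_simhash,
...     simhash_from_bigint,
...     simhash_to_bigint,
... )
>>> simhash = get_simhash('Lorem ipsum dolor sit amet')
>>> bigint = simhash_to_bigint(simhash)
>>> -2**63 <= bigint < 2**63
True
>>> simhash_from_bigint(bigint).value == simhash.value
True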
diff --git a/src/atextcrawler/utils/tag.py b/src/atextcrawler/utils/tag.py
new file mode 100644
index 0000000..d636928
--- /dev/null
+++ b/src/atextcrawler/utils/tag.py
@@ -0,0 +1,189 @@
+"""
+Information collections related to html tags.
+"""
+
+
+drop_tags = [
+    'applet',
+    'area',
+    'audio',
+    'base',
+    'basefont',
+    'bdi',
+    'bdo',
+    'button',
+    'canvas',
+    'code',
+    'command',
+    'data',
+    'datalist',
+    'dir',
+    'embed',
+    'fieldset',
+    'figure',
+    'form',
+    'frame',
+    'frameset',
+    'iframe',
+    'img',
+    'input',
+    'label',
+    'legend',
+    'map',
+    'menuitem',
+    'meter',
+    'noframes',
+    'noscript',
+    'object',
+    'optgroup',
+    'option',
+    'param',
+    'picture',
+    'progress',
+    'rp',
+    'rt',
+    'ruby',
+    'samp',
+    'script',
+    'select',
+    'source',
+    'style',
+    'svg',
+    'template',
+    'textarea',
+    'track',
+    'var',
+    'video',
+]
+"""
+Tags to drop, including their content.
+"""
+
+
+keep_tags = {
+    'a': (0, 0, ''),
+    'abbr': (0, 0, 'st'),
+    'acronym': (0, 0, 'st'),
+    'address': (1, 0, 'm'),
+    'article': (1, 15, ''),
+    'aside': (1, 0, 'd'),
+    'b': (0, 0, 'st'),
+    'blockquote': (1, 65, 'q'),
+    'br': (1, 80, ''),
+    'caption': (1, 68, ''),
+    'center': (1, 50, ''),
+    'cite': (1, 0, 'd'),
+    'col': (1, 75, ''),
+    'colgroup': (1, 73, ''),
+    'dd': (1, 70, 'li'),
+    'del': (0, 0, 'se'),
+    'details': (1, 0, 'd'),
+    'dfn': (0, 0, 'st'),
+    'div': (1, 60, ''),  # lvl often revised to min of contained tags
+    'dl': (1, 70, 'l'),
+    'dt': (1, 70, 'li'),
+    'em': (0, 0, 'st'),
+    'figcaption': (1, 0, ''),
+    'font': (0, 0, 's'),
+    'footer': (1, 15, ''),
+    'h1': (1, 30, ''),
+    'h2': (1, 32, ''),
+    'h3': (1, 34, ''),
+    'h4': (1, 36, ''),
+    'h5': (1, 38, ''),
+    'h6': (1, 40, ''),
+    'header': (1, 15, ''),
+    'hr': (1, 30, ''),
+    'i': (0, 0, 'st'),
+    'ins': (0, 0, 'se'),
+    'li': (1, 75, 'li'),  # lvl revised if not inside p
+    'main': (1, 10, ''),
+    'mark': (0, 0, 's'),
+    'nav': (1, 0, ''),  # keep for footnotes
+    'ol': (1, 70, 'l'),  # lvl revised if not inside p
+    'p': (1, 60, ''),
+    'pre': (1, 65, 'q'),
+    'q': (1, 0, 'q'),
+    's': (0, 0, ''),
+    'section': (1, 24, ''),
+    'small': (0, 0, 'd'),
+    'span': (0, 0, 's'),
+    'strike': (0, 0, 'se'),
+    'strong': (0, 0, 'st'),
+    'sub': (0, 0, ''),
+    'summary': (1, 20, 'm'),
+    'sup': (0, 0, ''),
+    'table': (1, 65, ''),
+    'tbody': (1, 70, ''),
+    'td': (1, 78, ''),
+    'tfoot': (1, 70, ''),
+    'th': (1, 75, ''),
+    'thead': (1, 70, ''),
+    'time': (0, 0, 'm'),
+    'tr': (1, 75, ''),
+    'u': (0, 0, 's'),
+    'ul': (1, 70, 'l'),  # lvl revised if not inside p
+}
+"""
+Tags to keep for annotation, and their properties.
+
+The properties are:
+
+  * sep: whether to separate text at both sides of the tag with a space
+  * lvl: structural depth level of content of this tag;
+    the paragraph level is 60; headings are below 60, listings above;
+    a div below the tag will usually have the tag's depth + 1
+  * sem: semantic categories: zero or more of
+      * s=span
+      * l=listing
+      * i=list_item
+      * t=term
+      * e=edit
+      * d=details
+      * q=quote
+      * m=meta
+      * x=exclude
+    (the letters combine, e.g. 'st' means span + term)
+"""
+
+
+self_closing_tags = ('br', 'hr')
+"""
+Those among keep_tags which are self-closing.
+"""
+
+
+all_self_closing_tags = (
+    'area',
+    'base',
+    'br',
+    'col',
+    'embed',
+    'hr',
+    'img',
+    'input',
+    'link',
+    'meta',
+    'param',
+    'source',
+    'track',
+    'wbr',
+)
+"""
+All self-closing tags of the html standard.
+"""
+
+
+drop_roles = (
+    'banner',
+    'complementary',
+    'contentinfo',
+    'dialog',
+    'figure',
+    'form',
+    'img',
+    'search',
+    'switch',
+)
+"""
+Drop tags with these aria roles.
+"""
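Consumers unpack the keep_tags values as (sep, lvl, sem) triples, e.g.:

>>> from atextcrawler.utils.tag import drop_tags, keep_tags
>>> keep_tags['blockquote']
(1, 65, 'q')
>>> 'script' in drop_tags
True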
diff --git a/tests/__init__.py b/tests/__init__.py
new file mode 100644
index 0000000..b5f7e34
--- /dev/null
+++ b/tests/__init__.py
@@ -0,0 +1,7 @@
+from .annotation import AnnotateTest
+from .date_finder import DateFinderTest
+from .durl import DurlTest
+from .page import PageCleanTest
+from .section import IterSectionTest, AggSectionTest
+from .simhash import SimhashTest
+from .text import CleanHtmlTest
diff --git a/tests/annotation.py b/tests/annotation.py
new file mode 100644
index 0000000..f82c68d
--- /dev/null
+++ b/tests/annotation.py
@@ -0,0 +1,49 @@
+"""
+Test cases for text annotation.
+"""
+
+from unittest import TestCase
+
+from atextcrawler.utils.annotation import annotate
+
+
+class AnnotateTest(TestCase):
+    """
+    Test annotation.
+
+    Consider that the <br> and <hr> tags are self-closing.
+    """
+
+    def test_annotate_1(self):
+        s = 'Hello<br>world'
+        text, anns = annotate(s)
+        self.assertEqual(text, ' Hello world')
+        self.assertEqual(anns['semantic_breaks'], {0: 80, 6: 80})
+        self.assertEqual(anns['section_ids'], {})
+
+    def test_annotate_2(self):
+        s = ' Hello <br> world '
+        text, anns = annotate(s)
+        self.assertEqual(text, ' Hello world')
+        self.assertEqual(anns['semantic_breaks'], {0: 80, 6: 80})
+        self.assertEqual(anns['section_ids'], {})
+
+    def test_annotate_3(self):
+        s = '<div> <p> Hello world </p> </div>'
+        text, anns = annotate(s)
+        self.assertEqual(text, ' Hello world')
+        self.assertEqual(anns['semantic_breaks'], {0: 60})
+
+    def test_annotate_4(self):
+        s = '<div> <p id="ref1"> Hello world </p> </div>'
+        text, anns = annotate(s)
+        self.assertEqual(text, ' Hello world')
+        self.assertEqual(anns['semantic_breaks'], {0: 60})
+        self.assertEqual(anns['section_ids'], {0: ['ref1']})
+
+    def test_annotate_5(self):
+        s = '<br>Hello<div> <p id="ref2">world</p> </div>'
+        text, anns = annotate(s)
+        self.assertEqual(text, ' Hello world')
+        self.assertEqual(anns['semantic_breaks'], {0: 80, 6: 60})
+        self.assertEqual(anns['section_ids'], {1: ['ref2']})
diff --git a/tests/date_finder.py b/tests/date_finder.py
new file mode 100644
index 0000000..add02f1
--- /dev/null
+++ b/tests/date_finder.py
@@ -0,0 +1,20 @@
+from datetime import datetime
+from unittest import TestCase
+
+from atextcrawler.utils.date_finder import extract_latest_date
+
+
+class DateFinderTest(TestCase):
+    def test_extract_latest_date(self):
+        s = 'test 1987-2+1-no'
+        r = datetime(1987, 2, 1)
+        self.assertEqual(extract_latest_date(s), r)
+        s = '2020-04-06, whatever and 1987-2-1, 1/20/2021'
+        r = datetime(2020, 4, 6)
+        self.assertEqual(extract_latest_date(s, lang='de'), r)
+        s = 'test 2022-04-06, whatever and 1987-2-1, 1/20/2021'
+        r = datetime(2021, 1, 20)
+        self.assertEqual(extract_latest_date(s, lang='en'), r)
+        s = ''
+        r = None
+        self.assertEqual(extract_latest_date(s), r)
diff --git a/tests/durl.py b/tests/durl.py
new file mode 100644
index 0000000..f805557
--- /dev/null
+++ b/tests/durl.py
@@ -0,0 +1,68 @@
+from unittest import IsolatedAsyncioTestCase
+
+from atextcrawler.config import Config
+from atextcrawler.db import PGPool
+from atextcrawler.utils.durl import Durl
+
+
+class DurlTest(IsolatedAsyncioTestCase):
+    async def asyncSetUp(self):
+        config = Config().get()
+        self.pool = PGPool(config['postgresql'])
+        await self.pool.__aenter__()
+        self.conn = await self.pool.pool.acquire()
+
+    async def test_durl_basic(self):
+        durl1 = await Durl('https://U:Pw@www.EXAMPLE.com:8000/hello?world#a')
+        self.assertEqual(durl1.scheme, 'https')
+        self.assertEqual(durl1.netloc, 'U:Pw@www.example.com:8000')
+        self.assertEqual(durl1.port, 8000)
+        self.assertEqual(durl1.path, '/hello')
+        self.assertEqual(durl1.fragment, '')
+        self.assertEqual(durl1.pwa(), 'hello?world')
+        self.assertEqual(durl1.site(), 'https://U:Pw@www.example.com:8000/')
+        self.assertEqual(
+            durl1.url(), 'https://U:Pw@www.example.com:8000/hello?world'
+        )
+        self.assertEqual(durl1.has_path(), True)
+        durl2 = await Durl('http://www.example.com/')
+        self.assertEqual(durl2.has_path(), False)
+        durl3 = await Durl('ftp://www.example.com/')
+        self.assertEqual(durl3, None)
+
+    async def test_durl_with_base(self):
+        durl1 = await Durl('https://www.example.com')
+        self.assertEqual(durl1.path, '/')
+        self.assertEqual(durl1.pwa(), '')
+        self.assertEqual(durl1.has_path(), False)
+        durl2 = await Durl('https://www.example.com/hello2', base=durl1)
+        self.assertEqual(durl2.hostname, 'www.example.com')
+        self.assertEqual(durl2.path, '/hello2')
+        self.assertEqual(durl2.pwa(), 'hello2')
+        durl3 = await Durl('/hello3?x=1', base=durl1)
+        self.assertEqual(durl3.hostname, 'www.example.com')
+        self.assertEqual(durl3.path, '/hello3')
+        self.assertEqual(durl3.pwa(), 'hello3?x=1')
+        self.assertEqual(durl3.site(), 'https://www.example.com/')
+        durl4 = await Durl('https://www.kernel.org/', base=durl1)
+        self.assertEqual(durl4, None)
+
+    async def test_durl_with_base_and_match_base(self):
+        durl1 = await Durl('https://www.example.com/base/path/')
+        self.assertEqual(durl1.path, '/base/path/')
+        self.assertEqual(durl1.pwa(), 'base/path/')
+        self.assertEqual(durl1.has_path(), True)
+        durl2 = await Durl(
+            'https://www.example.com/base/', base=durl1, match_base=True
+        )
+        self.assertEqual(durl2, None)
+        durl3 = await Durl(
+            'https://www.example.com/base/path/whatever?x=1#a',
+            base=durl1,
+            match_base=True,
+        )
+        self.assertEqual(durl3.pwa(), 'whatever?x=1')
+
+    async def asyncTearDown(self):
+        await self.pool.pool.release(self.conn)
+        await self.pool.pool.close()
diff --git a/tests/page.py b/tests/page.py
new file mode 100644
index 0000000..9cb76bc
--- /dev/null
+++ b/tests/page.py
@@ -0,0 +1,24 @@
+"""
+Test cases for resource type page.
+"""
+
+from unittest import TestCase
+
+from atextcrawler.utils.html import clean_body
+
+# from atextcrawler.utils.tag import drop_tags
+
+
+class PageCleanTest(TestCase):
+    def test_clean_body_1(self):
+        s = ' Hello <span> </span> world '
+        r = 'Hello world'
+        self.assertEqual(clean_body(s), r)
+
+
+#    def test_drop_tags(self):
+#        s = '<script>something<style>else</style>...</script>'
+#        r = drop_tags(s)
+#        self.assertEqual(r, '')
+#        s = '<script>something</script>'
+#        r = drop_tags(s)
+#        self.assertEqual(r, '')
diff --git a/tests/section.py b/tests/section.py
new file mode 100644
index 0000000..be47a8b
--- /dev/null
+++ b/tests/section.py
@@ -0,0 +1,105 @@
+from unittest import TestCase
+
+from atextcrawler.utils.section import concat_section_texts, iter_sections
+
+
+class IterSectionTest(TestCase):
+    def test_iter_sections_1(self):
+        s = 'abcdefghijklmnopqrstuvwxyz'
+        sb = {0: 80, 5: 2, 15: 1, 20: 3}
+        sections1 = list(iter_sections(s, sb, max_level=100))
+        sections2 = [
+            (0, 5, 80, 'bcde'),
+            (5, 15, 2, 'ghijklmno'),
+            (15, 20, 1, 'qrst'),
+            (20, 26, 3, 'uvwxyz'),
+        ]
+        self.assertEqual(sections1, sections2)
+
+    def test_iter_sections_2(self):
+        s = 'abcdefghijklmnopqrstuvwxyz'
+        sb = {0: 4, 5: 2, 15: 1, 20: 3, 26: 9}
+        sections1 = list(iter_sections(s, sb, max_level=100))
+        sections2 = [
+            (0, 5, 4, 'bcde'),
+            (5, 15, 2, 'ghijklmno'),
+            (15, 20, 1, 'qrst'),
+            (20, 26, 3, 'vwxyz'),
+        ]
+        self.assertEqual(sections1, sections2)
+
+    def test_iter_sections_3(self):
+        s = 'abcdefghijklmnopqrstuvwxyz'
+        sb = {5: 2, 15: 60, 18: 50, 20: 3}
+        sections1 = list(iter_sections(s, sb, max_level=59))
+        sections2 = [
+            (0, 5, 80, 'bcde'),
+            (5, 18, 2, 'ghijklmnopqr'),
+            (18, 20, 50, 't'),
+            (20, 26, 3, 'uvwxyz'),
+        ]
+        self.assertEqual(sections1, sections2)
+
+    def test_iter_sections_4(self):
+        s = 'abcdefghijklmnopqrstuvwxyz'
+        sb = {5: 2, 15: 60, 18: 50, 20: 3, 24: 60}
+        sections1 = list(iter_sections(s, sb, max_level=59))
+        sections2 = [
+            (0, 5, 80, 'bcde'),
+            (5, 18, 2, 'ghijklmnopqr'),
+            (18, 20, 50, 't'),
+            (20, 26, 3, 'uvwxyz'),
+        ]
+        self.assertEqual(sections1, sections2)
+
+
+class AggSectionTest(TestCase):
+    def test_concat_sections_1(self):
+        s = 'abcdefghijklmnopqrstuvwxyz'
+        sb = {0: 1, 5: 1, 15: 1, 20: 1}
+        sections1 = list(concat_section_texts(s, sb, min_len=10))
+        sections2 = [
+            ([0, 1], 'abcdefghijklmno'),
+            ([2, 3], 'pqrstuvwxyz'),
+        ]
+        self.assertEqual(sections1, sections2)
+
+    def test_concat_sections_2(self):
+        s = 'abcdefghijklmnopqrstuvwxyz'
+        sb = {0: 1, 2: 1, 10: 1, 20: 1, 26: 1}
+        sections1 = list(concat_section_texts(s, sb, min_len=10))
+        sections2 = [
+            ([0, 1], 'abcdefghij'),
+            ([2, 3, 4], 'klmnopqrstuvwxyz'),
+        ]
+        self.assertEqual(sections1, sections2)
+
+    def test_concat_sections_3(self):
+        s = 'abcdefghijklmnopqrstuvwxyz'
+        sb = {0: 1, 4: 1, 6: 1, 16: 1, 26: 1}
+        sections1 = list(concat_section_texts(s, sb, min_len=10))
+        sections2 = [
+            ([0, 1, 2], 'abcdefghijklmnop'),
+            ([3, 4], 'qrstuvwxyz'),
+        ]
+        self.assertEqual(sections1, sections2)
+
+    def test_concat_sections_4(self):
+        s = 'abcdefghijklmnopqrstuvwxyz'
+        sb = {0: 1, 5: 1, 15: 1, 26: 1}
+        sections1 = list(concat_section_texts(s, sb, min_len=10))
+        sections2 = [
+            ([0, 1], 'abcdefghijklmno'),
+            ([2, 3], 'pqrstuvwxyz'),
+        ]
+        self.assertEqual(sections1, sections2)
+
+    def test_concat_sections_5(self):
+        s = 'abcdefghijklmnopqrstuvwxyz'
+        sb = {0: 1, 5: 1, 12: 1, 22: 1, 23: 1, 24: 1, 26: 1}
+        sections1 = list(concat_section_texts(s, sb, min_len=10))
+        sections2 = [
+            ([0, 1], 'abcdefghijkl'),
+            ([2, 3, 4, 5, 6], 'mnopqrstuvwxyz'),
+        ]
+        self.assertEqual(sections1, sections2)
diff --git a/tests/simhash.py b/tests/simhash.py
new file mode 100644
index 0000000..095d244
--- /dev/null
+++ b/tests/simhash.py
@@ -0,0 +1,54 @@
+"""
+Test cases for the similarity util (simhash).
+"""
+
+from unittest import TestCase
+
+from simhash import Simhash, SimhashIndex
+
+from atextcrawler.utils.similarity import (
+    create_simhash,
+    get_features,
+    get_simhash,
+    postgresql_bigint_offset,
+    search_simhash,
+)
+
+
+class SimhashTest(TestCase):
+    """
+    Test simhash creation and search.
+    """
+
+    def test_search(self):
+        n1 = int('1111111100000000', 2)
+        n2 = int('1111111100000111', 2)
+        n3 = int('1000000000000000', 2)
+        n4 = int('1000000000000111', 2)
+        n5 = int('1000001111000000', 2)
+        objs = [
+            ('1', Simhash(n1)),
+            ('3', Simhash(n3)),
+            ('4', Simhash(n4)),
+        ]
+        index = SimhashIndex(objs, k=3)
+        found = search_simhash(index, Simhash(n5))
+        self.assertEqual(found, [])
+        found = search_simhash(index, Simhash(n1))
+        self.assertEqual(found, [1])
+        found = search_simhash(index, Simhash(n2))
+        self.assertEqual(found, [1])
+        found = search_simhash(index, Simhash(n4))
+        self.assertEqual(found, [3, 4])
+
+    def test_create(self):
+        index = SimhashIndex([], k=3)
+        hash_val_1 = create_simhash(index, 101, get_simhash('hello ' * 20))
+        hash_val_2 = create_simhash(index, 102, get_simhash('another one'))
+        simhash_1 = Simhash(hash_val_1 + postgresql_bigint_offset)
+        simhash_2 = Simhash(hash_val_2 + postgresql_bigint_offset)
+        found = search_simhash(index, simhash_1)
+        self.assertEqual(found, [101])
+        found = search_simhash(index, simhash_2)
+        self.assertEqual(found, [102])
+        simhash_3 = get_simhash('hello ' * 20 + 'X')
+        found = search_simhash(index, simhash_3)
+        self.assertEqual(found, [101])
diff --git a/tests/text.py b/tests/text.py
new file mode 100644
index 0000000..2460459
--- /dev/null
+++ b/tests/text.py
@@ -0,0 +1,65 @@
+"""
+Test cases for text util.
+"""
+
+from unittest import TestCase
+
+from atextcrawler.utils.html import clean_page
+
+
+class CleanHtmlTest(TestCase):
+    """
+    Test clean_page.
+
+    Have an eye on self-closing tags (br, hr, ...).
+    """
+
+    def test_clean_page_1(self):
+        s = 'Hello<br>anything'
+        r = 'Hello<br/>anything'
+        self.assertEqual(str(clean_page(s)), r)
+
+    def test_clean_page_2(self):
+        s = 'Hello<hr>anything'
+        r = 'Hello<hr/>anything'
+        self.assertEqual(str(clean_page(s)), r)
+
+    def test_clean_page_3(self):
+        # nesting
+        s = '--<script>xx<script>yy</script>zz</script>..'
+        r = '--..'
+        self.assertEqual(str(clean_page(s)), r)
+
+    def test_clean_page_4(self):
+        # aria-hidden
+        s = '--<hr aria-hidden="true">..'
+        r = '--..'
+        self.assertEqual(str(clean_page(s)), r)
+        s = "--<hr aria-hidden='TRUE'>.."
+        r = '--..'
+        self.assertEqual(str(clean_page(s)), r)
+        s = '--<p><span aria-hidden="false">xx</span></p>..'
+        r = '--<p><span aria-hidden="false">xx</span></p>..'
+        self.assertEqual(str(clean_page(s)), r)
+        s = '--<p><span aria-hidden="">xx</span></p>..'
+        r = '--<p><span aria-hidden="">xx</span></p>..'
+        self.assertEqual(str(clean_page(s)), r)
+        s = '--<p><span aria-hidden="no">xx</span></p>..'
+        r = '--<p><span aria-hidden="no">xx</span></p>..'
+        self.assertEqual(str(clean_page(s)), r)
+
+    def test_clean_page_5(self):
+        # no removal
+        s = '--<p><span>xxyy</span></p>..'
+        r = '--<p><span>xxyy</span></p>..'
+        self.assertEqual(str(clean_page(s)), r)
+
+    def test_clean_page_6(self):
+        # self-closing tags to be removed
+        s = '--<p><br>xx<embed></p>\n...<p><hr>tt<input></p>nn'
+        r = '--<p><br/>xx</p>\n...<p><hr/>tt</p>nn'
+        self.assertEqual(str(clean_page(s)), r)
+
+    def test_clean_page_7(self):
+        s = '--<embed>nn'
+        r = '--nn'
+        self.assertEqual(str(clean_page(s)), r)