Scribd-1 1 1 Tar

pax_global_header##################################################################
#################0000666#0000000#0000000#00000000064#13407400542#0014511#g#########
###################################################################################
########ustar#00root############################root############################000
0000#0000000#######################################################################
###################################################################################
##############52 comment=9ab728f7cabd2d2a3708d758d4f7c209b7722a29
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
#############################################scribd-downloader-
1.1.1/############################################################################0
000775#0000000#0000000#00000000000#13407400542#0015113#5###########################
#########################################################################ustar#00ro
ot############################root############################0000000#0000000######
###################################################################################
###############################################################################scri
bd-downloader-
1.1.1/.gitignore##################################################################0
000664#0000000#0000000#00000000170#13407400542#0017101#0###########################
#########################################################################ustar#00ro
ot############################root############################0000000#0000000######
###################################################################################
###############################################################################/*.t
xt
/*.pdf
/*.jpg
/*.jpeg
/*.png
*.pyc
__pycache__/
.cache/
scribd_downloader.egg-info/
build/
dist/
.pytest_cache/
###################################################################################
###################################################################################
###################################################################################
###################################################################################
############################################################scribd-downloader-
1.1.1/.travis.yml#################################################################0
000664#0000000#0000000#00000000225#13407400542#0017223#0###########################
#########################################################################ustar#00ro
ot############################root############################0000000#0000000######
###################################################################################
###############################################################################dist
: xenial
language: python
sudo: required
python:
- "3.4"
- "3.5"
- "3.6"
- "3.7"
install: pip install -e .
script: python -m pytest test
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###############################scribd-downloader-
1.1.1/LICENSE#####################################################################0
000664#0000000#0000000#00000002060#13407400542#0016116#0###########################
#########################################################################ustar#00ro
ot############################root############################0000000#0000000######
###################################################################################
###############################################################################MIT
License
Copyright (c) 2016 Ritiek Malhotra
Permission is hereby granted, free of charge, to any person obtaining a copy

of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
#################################################scribd-downloader-
1.1.1/README.rst##################################################################0
000664#0000000#0000000#00000006137#13407400542#0016611#0###########################
#########################################################################ustar#00ro
ot############################root############################0000000#0000000######
###################################################################################
###############################################################################Scri
bd-Downloader
=================
|PyPi Version| |Build Status|
(I also found an online service https://dlscrib.com/ created by `Erik Fong`_. It

doesn't
use this script as some people seem to think!).
This python script allows downloading of Scribd documents. It does not matter if
the pages
are blurred or require authentication, this script will still do its job.
There are two types of documents on Scribd:

- Documents made up using a collection of images and
- Actual documents where the text can be selected, copied etc.
This script takes a different approach to both of them:
- Documents consisting of a collection of images are straightforward and

this script will simply download the individual images which can
be combined into a ``.pdf`` by passing the ``--pdf`` option to the tool. Simple.
- Actual documents where the text can be selected are hard to tackle.
If we feed such a document to this tool, only the text present in
document will be downloaded. Scribd seems to use javascript to somehow
combine text and images. So far, I haven't been able to combine them
with Python in a way they look like the original document.
Installation
------------
Make sure you're using Python 3 (Python 2 is not supported by a few dependencies).
Then run these commands:
::
$ pip install scribd-downloader
or install the development version with:
::
$ python setup.py install
Usage
-----
::
usage: scribdl [-h] [-i] [-p] URL
Download documents and books from scribd.com
positional arguments:
URL scribd url to download
optional arguments:
-h, --help show this help message and exit
-i, --images download url made up of images
-p, --pdf convert to pdf (*Nix: imagemagick)
Examples
--------
Downloading text from document containing selectable text:

::
$ scribdl https://www.scribd.com/document/55949937/33-Strategies-of-War
(Text will be saved side by side in a ``.md`` file in the current

working directory)
Downloading a document containing images; use the ``--images`` option (the tool cannot
figure this out on its own):
::
$ scribdl -i http://scribd.com/doc/17142797/Case-in-Point
(Images will be saved in the current working directory)
It can now also download complete books by presenting itself as a premium user!
This will generate an ``.md`` file in the current working directory:
::
$ scribdl https://www.scribd.com/read/189087235/Confessions-of-a-Casting-
Director-Help-Actors-Land-Any-Role-with-Secrets-from-Inside-the-Audition-Room
Pass ``--pdf`` option to convert the generated output to a PDF.
Disclaimer
----------
Downloading books from Scribd for free may be prohibited. This tool is
meant for educational purposes only. Please support the authors by buying
their titles.
License
-------
``The MIT License``
.. |PyPi Version| image:: https://img.shields.io/pypi/v/scribd-downloader.svg

:target: https://pypi.org/project/scribd-downloader
.. |Build Status| image:: https://travis-ci.org/ritiek/scribd-downloader.svg?

branch=master
:target: https://travis-ci.org/ritiek/scribd-downloader
.. _Erik Fong: mailto:dlscrib@gmail.com

###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
##scribd-downloader-
1.1.1/scribdl/####################################################################0
000775#0000000#0000000#00000000000#13407400542#0016535#5###########################
#########################################################################ustar#00ro
ot############################root############################0000000#0000000######
###################################################################################
###############################################################################scri
bd-downloader-
1.1.1/scribdl/__init__.py#########################################################0
000664#0000000#0000000#00000000342#13407400542#0020645#0###########################
#########################################################################ustar#00ro
ot############################root############################0000000#0000000######
###################################################################################
###############################################################################from
.version import __version__
from .downloader import Downloader
from .document import ScribdTextualDocument

from .document import ScribdImageDocument
from .book import ScribdBook
from .pdf_converter import ConvertToPDF

###################################################################################
###################################################################################
###################################################################################
#####################################scribd-downloader-
1.1.1/scribdl/base.py#############################################################0
000664#0000000#0000000#00000000520#13407400542#0020016#0###########################
#########################################################################ustar#00ro
ot############################root############################0000000#0000000######
###################################################################################
###############################################################################from
abc import ABCMeta, abstractmethod
import six
@six.add_metaclass(ABCMeta)
class ScribdBase:
    """
    Abstract interface shared by Scribd books and documents.

    The ``ABCMeta`` metaclass is attached through ``six`` so the same
    class statement works regardless of the metaclass syntax in use.
    """

    @abstractmethod
    def get_content(self):
        """
        Fetch the content of a Scribd book or document.

        Concrete subclasses must override this method.
        """
###################################################################################
###################################################################################
##########scribd-downloader-
1.1.1/scribdl/book.py#############################################################0
000664#0000000#0000000#00000013106#13407400542#0020042#0###########################
#########################################################################ustar#00ro
ot############################root############################0000000#0000000######
###################################################################################
###############################################################################impo
rt requests
import json
import os
import shutil
from .base import ScribdBase
class ScribdBook(ScribdBase):
    """
    A class for downloading books off Scribd.

    Parameters
    ----------
    url : `str`
        A string containing Scribd book URL.
    """

    def __init__(self, url):
        self.url = url
        self.book_id = str(self.get_id())

    def _extract_text(self, content, chapter, token):
        """
        Collects the text fragments listed under ``content["words"]``.

        Entries with a ``break_map`` or ``text`` value contribute that
        text; entries of type ``image`` are downloaded and contribute a
        markdown image link; any other entry is descended into
        recursively. Returns a list of strings.
        """
        words = []
        for word in content["words"]:
            if word.get("break_map", None):
                words.append(word["break_map"]["text"])
            elif word.get("text", None):
                words.append(word["text"])
            elif word.get("type", None) == "image":
                image_url = self._format_image_url(chapter, word["src"], token)
                words.append(self._process_image_text(word, image_url))
            else:
                words += self._extract_text(word, chapter, token)
        return words

    def get_content(self):
        """
        Processing text and image extraction.

        Fetches chapters one by one, starting at chapter 1, and appends
        their content to ``<book_id>.md``. Stops when a chapter still
        answers 403 after the token was refreshed, or when a response is
        not valid JSON. Returns the name of the markdown file.
        """
        token = self._get_token()
        filename = self.book_id + ".md"

        chapter = 1
        while True:
            response = self.fetch_response(chapter, token)
            if response.status_code == 403:
                # Retry once with a fresh token before giving up.
                token = self._get_token()
                response = self.fetch_response(chapter, token)
                if response.status_code == 403:
                    print("No more content being exposed by Scribd!")
                    break

            try:
                json_response = json.loads(response.text)
            except ValueError:
                print("Completed downloading book!")
                break

            self._extract_text_blocks(json_response, chapter, token, filename)
            chapter += 1

        return filename

    def fetch_response(self, chapter, token):
        """
        Requests the raw content of the given chapter and returns the
        `requests` response object.
        """
        url = self._format_content_url(chapter, token)
        return requests.get(url)

    def _extract_text_blocks(self, response_dict, chapter, token, filename):
        """
        Extracts small blocks of raw book text and image
        URLs and writes them to a file.

        Blocks whose type is neither ``text`` nor ``image`` are skipped.
        """
        for block in response_dict["blocks"]:
            if block["type"] == "text":
                string_text = (
                    " ".join(self._extract_text(block, chapter, token)) + "\n\n"
                )
            elif block["type"] == "image":
                image_url = self._format_image_url(chapter, block["src"], token)
                string_text = self._process_image_text(block, image_url)
            else:
                continue

            print(string_text)
            self.save_text(string_text, filename)

    def _process_image_text(self, block, image_url):
        """
        Downloads the image referenced by ``block`` into a directory named
        after the book ID and returns a markdown image link pointing to it.
        """
        image_name = block["src"].replace("images/", "")
        image_path = os.path.join(self.book_id, image_name)
        self._download_image(image_url, image_path)
        return "![{}]({})\n\n".format(image_name, image_path)

    def _download_image(self, url, path):
        """
        Streams the image at ``url`` into the file ``path``, creating the
        parent directory first when it does not exist yet.
        """
        directory = os.path.dirname(path)
        if directory:
            # Only an already-existing directory is tolerated; other
            # failures (e.g. permissions) are reported instead of hidden.
            os.makedirs(directory, exist_ok=True)

        response = requests.get(url, stream=True)
        with open(path, "wb") as out_file:
            shutil.copyfileobj(response.raw, out_file)

    def _extract_image_path_from_url(self, url):
        """
        Builds the local image path from the last component of ``url``,
        dropping anything from ``?token=`` onwards.
        """
        image_name = url.split("/")[-1].split("?token=")[0]
        return os.path.join(self.book_id, image_name)

    def _format_content_url(self, chapter, token):
        """
        Generates a string which points to a URL containing
        the raw book text.
        """
        unformatted_url = (
            "https://www.scribd.com/scepub/{}/chapters/{}/"
            "contents.json?token={}"
        )
        return unformatted_url.format(self.book_id, chapter, token)

    def _format_image_url(self, chapter, image, token):
        """
        Generates a string which points to an image URL.
        """
        unformatted_url = (
            "https://www.scribd.com/scepub/{}/chapters/{}/"
            "{}?token={}"
        )
        return unformatted_url.format(self.book_id, chapter, image, token)

    def get_id(self):
        """
        Extracts the book ID.

        The ID is the last ``/``-separated segment of the URL that parses
        as an integer.

        Raises
        ------
        ValueError
            If no segment of the URL is an integer.
        """
        book_id = None
        for split in self.url.split("/"):
            try:
                book_id = int(split)
            except ValueError:
                continue

        if book_id is None:
            raise ValueError(
                "No numeric book ID found in URL: {}".format(self.url)
            )
        return book_id

    def _get_token(self):
        """
        Fetches a uniquely generated token for the current
        session.
        """
        # NOTE(review): the CSRF token and session cookie below are
        # hard-coded values; presumably they stop working once Scribd
        # expires them -- confirm and consider making them configurable.
        headers = {
            "X-CSRF-Token": (
                "jfHAQ/LjqJAexQtAkCgWi0hif/sWHi5pXVAHCNsC3GkZocGcHcfETUhZ/Wd+YyY0tEH/zV/hRCOZhyq7Ze"
                "wiMQ=="
            )
        }
        cookies = {
            "_scribd_session": (
                "eyJzZXNzaW9uX2lkIjoiNTg3N2VjOTAwMGNmOTM5M2IwMGEwY2ExZmI2YTRiOTQiLCJfY3NyZl90b2tlbi"
                "I6ImxGQUIzKzhrYk4xV25QWW43a3N3di93amdEWkovMnBLeE5jdHM3N3UvbGc9IiwiciI6IjE1NDM2Mjk1O"
                "DAiLCJ3b3JkX2lkIjoyNjMzNjM2MzIsInAiOjE1NDI5MzQ4NDMsImxhc3RfcmVhdXRoIjoxNTQzNjI5NTgw"
                "fQ%3D%3D--4f34750fb7295b3b6f26754547c2e1e568da3e86"
            ),
            "_scribd_expire": "1543629580",
        }
        data = "data"

        token_url = "https://www.scribd.com/read2/{}/access_token".format(
            self.book_id
        )
        token = requests.post(
            token_url, headers=headers, cookies=cookies, data=data
        )
        return json.loads(token.text)["response"]

    def save_text(self, string_text, filename):
        """
        Writes text to the passed file.

        The file is opened in append mode, so repeated calls accumulate.
        """
        with open(filename, "a") as f:
            f.write(string_text)
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###########################scribd-downloader-
1.1.1/scribdl/command_line.py#####################################################0
000664#0000000#0000000#00000002216#13407400542#0021535#0###########################
#########################################################################ustar#00ro
ot############################root############################0000000#0000000######
###################################################################################
###############################################################################impo
rt argparse
from .downloader import Downloader
def get_arguments(raw_args=None):
    """
    Parses arguments off the command-line.

    Parameters
    ----------
    raw_args : `list`, optional
        Argument strings to parse. When left as ``None`` the arguments
        are taken from ``sys.argv`` (the standard `argparse` behaviour).

    Returns
    -------
    `argparse.Namespace`
        Namespace with the attributes ``url``, ``images`` and ``pdf``.
    """
    parser = argparse.ArgumentParser(
        description="Download documents and books from scribd.com"
    )

    parser.add_argument(
        "url", metavar="URL", type=str, help="scribd url to download"
    )
    parser.add_argument(
        "-i",
        "--images",
        help="download url made up of images",
        action="store_true",
        default=False,
    )
    parser.add_argument(
        "-p",
        "--pdf",
        help="convert to pdf (*Nix: imagemagick)",
        action="store_true",
        default=False,
    )

    return parser.parse_args(raw_args)
def _command_line():
    """
    Entry point executed when the tool is invoked from the command-line.

    Downloads the content behind the URL argument and, when the PDF
    option was given, converts the downloaded content to a PDF.
    """
    arguments = get_arguments()
    target_url, want_pdf, image_mode = (
        arguments.url,
        arguments.pdf,
        arguments.images,
    )

    converter = Downloader(target_url).download(is_image_document=image_mode)
    if not want_pdf:
        return

    print("\nConverting to {}..".format(converter.pdf_path))
    converter.to_pdf()


if __name__ == "__main__":
    _command_line()
###################################################################################
###################################################################################
###################################################################################
###################################################################################
######################################scribd-downloader-
1.1.1/scribdl/document.py#########################################################0
000664#0000000#0000000#00000014102#13407400542#0020723#0###########################
#########################################################################ustar#00ro
ot############################root############################0000000#0000000######
###################################################################################
###############################################################################from
bs4 import BeautifulSoup
import requests
import shutil
import os
from abc import abstractmethod
from .base import ScribdBase
from . import internals
class ScribdDocument(ScribdBase):
    """
    A base class for downloading documents off Scribd.

    Parameters
    ----------
    url : `str`
        A string containing Scribd document URL.
    """

    def __init__(self, url):
        self.url = url
        response = requests.get(url)
        self.soup = BeautifulSoup(response.text, "html.parser")

    def get_title(self):
        """
        Scrapes the title of the Scribd document.

        The text of the page's ``<title>`` element is returned after
        being passed through `internals.sanitize_title`.
        """
        title = self.soup.find("title").get_text()
        return internals.sanitize_title(title)

    def _extract_all_jsonp_urls(self):
        """
        Extracts all URLs ending with '.jsonp' by parsing the
        HTML code.

        Every child of every ``<script type="text/javascript">`` element
        is inspected; at most one URL is taken from each child.
        """
        scripts = self.soup.find_all("script", type="text/javascript")
        jsonp_urls = []
        for script in scripts:
            for fragment in script:
                jsonp = self._extract_jsonp_url(fragment)
                if jsonp:
                    jsonp_urls.append(jsonp)
        return jsonp_urls

    def _extract_jsonp_url(self, inner_opening):
        """
        Extracts URLs ending with '.jsonp'. These URLs contain the
        raw document text.

        Returns the text from the first ``https://`` up to and including
        the first ``.jsonp`` that follows it, or ``None`` when either
        marker is missing.
        """
        suffix = ".jsonp"
        start = inner_opening.find("https://")
        if start == -1:
            return None

        # Search only after the scheme so an earlier ".jsonp" (or none at
        # all) cannot yield an empty or truncated slice.
        end = inner_opening.find(suffix, start)
        if end == -1:
            return None

        return inner_opening[start : end + len(suffix)]

    @abstractmethod
    def get_content(self):
        """
        An abstract method which will fetch the actual content
        found in the '.jsonp' URLs.
        """
        pass
class ScribdTextualDocument(ScribdDocument):
    """
    A class for downloading textual documents off Scribd.
    """

    def get_content(self):
        """
        Generates the filename and processes the text extraction
        to this file.

        Returns the name of the markdown file (``<title>.md``).
        """
        title = self.get_title()
        jsonp_urls = self._extract_all_jsonp_urls()

        print("Extracting text to " + title + ".md\n")

        filename = title + ".md"
        self.text_extractor(jsonp_urls, filename)
        return filename

    # NOTE(review): the source shows an empty docstring at this position
    # with no surrounding definition; a method may have been lost when the
    # file was extracted -- verify against the upstream repository.

    def text_extractor(self, jsonp_urls, filename):
        """
        Saves text from every '.jsonp' URL.
        """
        for jsonp_url in jsonp_urls:
            self.save_text(jsonp_url, filename)

    def save_text(self, jsonp, filename):
        """
        Makes a GET request to the '.jsonp' URL and saves
        the text to the passed file.

        The JSONP wrapper and backslash escapes are removed, the result
        is parsed as HTML, and the text of every ``<span class="a">`` is
        printed and appended to ``filename`` followed by a blank line.
        """
        response = requests.get(jsonp).text

        # Drop the leading 'window.page<N>_callback(["' wrapper for any
        # number of digits in <N>, not just a single one.
        prefix = "window.page"
        head, marker, tail = response.partition('_callback(["')
        if marker and head.startswith(prefix) and head[len(prefix):].isdigit():
            response = tail

        cleaned = (
            response.replace("\\n", "").replace("\\", "").replace('"]);', "")
        )
        soup_content = BeautifulSoup(cleaned, "html.parser")

        extractions = []
        for span in soup_content.find_all("span", {"class": "a"}):
            xtext = internals.fix_encoding(span.get_text())
            print(xtext)
            extractions.append(xtext + "\n\n")

        # Open the output once per page instead of once per span; nothing
        # is created when the page yields no text.
        if extractions:
            with open(filename, "a") as feed:
                feed.writelines(extractions)
class ScribdImageDocument(ScribdDocument):
    """
    A class for downloading image documents off Scribd.
    """

    def get_content(self):
        """
        Processes the image extraction.

        Returns the list of image filenames produced by
        `image_extractor`.
        """
        title = self.get_title()
        jsonp_urls = self._extract_all_jsonp_urls()
        # sometimes images embedded directly in html as well
        return self.image_extractor(jsonp_urls, title)

    # NOTE(review): the source shows an empty docstring at this position
    # with no surrounding definition; a method may have been lost when the
    # file was extracted -- verify against the upstream repository.

    def image_extractor(self, jsonp_urls, initial_filename):
        """
        Function for downloading images off '.jsonp' URLs to
        filenames.

        Images embedded directly in the HTML page are fetched first; the
        '.jsonp' images continue the same ``<name>_<n>.jpg`` numbering.
        Returns the list of all filenames, in download order.
        """
        downloaded_images = self._html_image_extractor(initial_filename)
        found = len(downloaded_images) > 0

        for jsonp_url in jsonp_urls:
            filename = "{}_{}.jpg".format(
                initial_filename, len(downloaded_images) + 1
            )
            self.save_image(jsonp_url, filename, found)
            downloaded_images.append(filename)

        return downloaded_images

    def _html_image_extractor(self, initial_filename):
        """
        Extracts images that are directly embedded in the original
        HTML page.

        Only ``<img class="absimg">`` elements carrying a ``src``
        attribute are considered. Returns the list of filenames.
        """
        downloaded_images = []
        absimg = self.soup.find_all("img", {"class": "absimg"}, src=True)

        for img in absimg:
            filename = "{}_{}.jpg".format(
                initial_filename, len(downloaded_images) + 1
            )
            self.save_image(img["src"], filename, found=False)
            downloaded_images.append(filename)

        return downloaded_images

    def convert_to_image_url(self, url, found):
        """
        Gets the image URL corresponding to the '.jsonp' URL.

        URLs not ending in '.jsonp' are returned unchanged. Otherwise
        ``/pages/`` becomes ``/images/`` and the '.jsonp' suffix becomes
        ``/000.jpg`` when ``found`` is true, ``.jpg`` when it is false.
        """
        if not url.endswith(".jsonp"):
            return url

        replacement = url.replace("/pages/", "/images/")
        suffix = "/000.jpg" if found else ".jpg"
        return replacement.replace(".jsonp", suffix)

    def save_image(self, jsonp_url, imagename, found=False):
        """
        Skips downloading if the image is already downloaded,
        otherwise downloads it locally.
        """
        print("Downloading " + imagename)

        # A direct existence check avoids listing the whole working
        # directory for every image.
        if os.path.exists(imagename):
            return

        url = self.convert_to_image_url(jsonp_url, found)
        response = requests.get(url, stream=True)
        with open(imagename, "wb") as out_file:
            shutil.copyfileobj(response.raw, out_file)
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###############################scribd-downloader-
1.1.1/scribdl/downloader.py#######################################################0
000664#0000000#0000000#00000004465#13407400542#0021256#0###########################
#########################################################################ustar#00ro
ot############################root############################0000000#0000000######
###################################################################################
###############################################################################from
bs4 import BeautifulSoup
import requests
from .document import ScribdTextualDocument

from .document import ScribdImageDocument
from .book import ScribdBook
from .pdf_converter import ConvertToPDF
class Downloader:
    """
    A helper class for downloading books and documents off Scribd.

    Parameters
    ----------
    url : `str`
        A string containing path to a Scribd URL
    """

    def __init__(self, url):
        self.url = url
        self._is_book = self.is_book()

    def download(self, is_image_document=None):
        """
        Downloads books and documents from Scribd.
        Returns an object of `ConvertToPDF` class.

        Parameters
        ----------
        is_image_document : `bool`, optional
            Whether the document is made up of images. Ignored for
            books; required when the URL points to a document.

        Raises
        ------
        TypeError
            If the URL points to a document and ``is_image_document``
            was left as ``None``.
        """
        if self._is_book:
            return self._download_book()

        if is_image_document is None:
            raise TypeError(
                "The input URL points to a document. You must specify "
                "whether it is an image document or a textual document "
                "in the `is_image_document` parameter."
            )
        return self._download_document(is_image_document)

    def _download_book(self):
        """
        Downloads books off Scribd.
        """
        book = ScribdBook(self.url)
        md_path = book.get_content()
        pdf_path = "{}.pdf".format(book.get_id())
        return ConvertToPDF(md_path, pdf_path)

    def _download_document(self, image_document):
        """
        Downloads textual and image documents off Scribd.
        """
        if image_document:
            document = ScribdImageDocument(self.url)
        else:
            document = ScribdTextualDocument(self.url)

        content_path = document.get_content()
        pdf_path = "{}.pdf".format(document.get_title())
        return ConvertToPDF(content_path, pdf_path)

    def is_book(self):
        """
        Checks whether the passed URL points to a Scribd book
        or a Scribd document

        The page counts as a book when the first CSS class of its
        ``<body>`` element is ``autogen_class_views_layouts_book_web``.
        A page without a ``<body>`` or without body classes is treated
        as a document.
        """
        response = requests.get(self.url)
        soup = BeautifulSoup(response.text, "html.parser")

        body = soup.find("body")
        content_class = body.get("class", []) if body is not None else []
        if not content_class:
            return False
        return content_class[0] == "autogen_class_views_layouts_book_web"
###################################################################################
###################################################################################
#####################################scribd-downloader-
1.1.1/scribdl/internals.py########################################################0
000664#0000000#0000000#00000001105#13407400542#0021103#0###########################
#########################################################################ustar#00ro
ot############################root############################0000000#0000000######
###################################################################################
###############################################################################impo
rt sys
def fix_encoding(query):
    """
    Encoding shim for Python 2 / Python 3 cross-compatibility.

    On Python 3 the value is handed back untouched; on Python 2 it is
    encoded to UTF-8.
    """
    running_python3 = sys.version_info > (3, 0)
    return query if running_python3 else query.encode("utf-8")
def sanitize_title(title):
    """
    Remove forbidden characters from title that will prevent Windows
    from creating directory.

    Also change ' ' to '_' to preserve previous behavior.

    Every occurrence of a space or of one of ``* " / \\ < > : | ( ) ,``
    is replaced by an underscore; all other characters are kept.
    """
    # The backslash is written as an explicit escape: a bare "\<" is an
    # invalid escape sequence that newer Python versions warn about.
    # NOTE(review): '?' is not in this set although Windows also rejects
    # it in file names -- confirm whether it should be added.
    forbidden_chars = ' *"/\\<>:|(),'
    replace_char = "_"

    # One translation pass instead of one full-string replace per character.
    table = str.maketrans(forbidden_chars, replace_char * len(forbidden_chars))
    return title.translate(table)
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
############################scribd-downloader-
1.1.1/scribdl/pdf_converter.py####################################################0
000664#0000000#0000000#00000002434#13407400542#0021752#0###########################
#########################################################################ustar#00ro
ot############################root############################0000000#0000000######
###################################################################################
###############################################################################from
md2pdf.core import md2pdf
import img2pdf
import os
class ConvertToPDF:
    """
    A class for converting downloaded books and documents to PDF.

    Parameters
    ----------
    input_content : `str`, `list`
        A string containing path to a single markdown file
        or a list containing paths to many images.

    output_path : `str`
        Output path of the generated PDF.
    """

    def __init__(self, input_content, output_path):
        self.input_content = input_content
        self.pdf_path = output_path

    def to_pdf(self):
        """
        Converts to PDF depending upon the type of content,
        i.e. images or markdown.

        A `list` is treated as image paths; anything else is treated as
        the path to a markdown file.
        """
        if isinstance(self.input_content, list):
            self._images_to_pdf()
        else:
            self._markdown_to_pdf()

    def _markdown_to_pdf(self):
        """
        Converts markdown to PDF.

        The current working directory is passed as the base URL.
        """
        md2pdf(
            self.pdf_path,
            md_file_path=self.input_content,
            base_url=os.getcwd(),
        )

    def _images_to_pdf(self):
        """
        Converts images to PDF.
        """
        # Read each image through a context manager so no file handle is
        # left open once the conversion is done.
        image_data = []
        for img in self.input_content:
            with open(img, "rb") as image_file:
                image_data.append(image_file.read())

        # Convert before opening the output so a failed conversion does
        # not leave a truncated PDF behind.
        pdf_images = img2pdf.convert(image_data)
        with open(self.pdf_path, "wb") as f:
            f.write(pdf_images)
###################################################################################
###################################################################################
##############################################################scribd-downloader-
1.1.1/scribdl/version.py##########################################################0
000664#0000000#0000000#00000000026#13407400542#0020572#0###########################
#########################################################################ustar#00ro
ot############################root############################0000000#0000000######
###################################################################################
###############################################################################__ve
rsion__ = "1.1.1"
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###########################################################################scribd-
downloader-
1.1.1/setup.py####################################################################0
000664#0000000#0000000#00000002151#13407400542#0016624#0###########################
#########################################################################ustar#00ro
ot############################root############################0000000#0000000######
###################################################################################
################################################################################!/u
sr/bin/env python
from setuptools import setup, find_packages

import os
# __version__ comes into namespace from here

# Execute scribdl/version.py in a dedicated namespace and pull ``__version__``
# out of it explicitly, instead of letting exec() write into module globals.
version_namespace = {}
with open(os.path.join("scribdl", "version.py")) as version_file:
    exec(version_file.read(), version_namespace)
__version__ = version_namespace["__version__"]

# The README is reused verbatim as the long description of the package.
with open("README.rst", "r") as f:
    long_description = f.read()

setup(
    name='scribd-downloader',
    version=__version__,
    description='Download documents/text from scribd.com',
    long_description=long_description,
    author='Ritiek Malhotra',
    author_email='ritiekmalhotra123@gmail.com',
    packages=find_packages(),
    entry_points={
        'console_scripts': [
            'scribdl = scribdl.command_line:_command_line',
        ]
    },
    url='https://www.github.com/ritiek/scribd-downloader',
    keywords=['scribd-downloader', 'documents', 'command-line', 'python'],
    license='MIT',
    download_url='https://github.com/ritiek/scribd-downloader/archive/v' +
                 __version__ + '.tar.gz',
    classifiers=[],
    install_requires=[
        'requests',
        'BeautifulSoup4',
        'img2pdf',
        'md2pdf',
    ],
)
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###########################################################################scribd-
downloader-
1.1.1/test/#######################################################################0
000775#0000000#0000000#00000000000#13407400542#0016072#5###########################
#########################################################################ustar#00ro
ot############################root############################0000000#0000000######
###################################################################################
###############################################################################scri
bd-downloader-
1.1.1/test/test_base.py###########################################################0
000664#0000000#0000000#00000000337#13407400542#0020420#0###########################
#########################################################################ustar#00ro
ot############################root############################0000000#0000000######
###################################################################################
###############################################################################from
scribdl.base import ScribdBase
import pytest
class TestOverrideScribdBase(ScribdBase):
    """Subclass of ScribdBase that adds no members of its own."""
def test_abstract_class():
    """Instantiating ScribdBase directly is expected to raise TypeError."""
    with pytest.raises(TypeError):
        # The instance is never used; only the raised exception matters.
        ScribdBase()
###################################################################################
###################################################################################
###################################################################################
########################################scribd-downloader-
1.1.1/test/test_download.py#######################################################0
000664#0000000#0000000#00000002651#13407400542#0021316#0###########################
#########################################################################ustar#00ro
ot############################root############################0000000#0000000######
###################################################################################
###############################################################################from
scribdl.downloader import Downloader
import os
import pytest
@pytest.fixture
def cwd_to_tmpdir(tmpdir):
    """Run the requesting test with ``tmpdir`` as the current working directory.

    The previous working directory is restored once the test finishes, so the
    directory change does not leak into tests that run afterwards.
    """
    previous_cwd = os.getcwd()
    os.chdir(str(tmpdir))
    try:
        yield
    finally:
        os.chdir(previous_cwd)
def test_book_download(cwd_to_tmpdir, monkeypatch):
    """Download a book and check the size of its content file and of the PDF."""
    book_url = "https://www.scribd.com/read/262694921/Acting-The-First-Six-Lessons"
    book_downloader = Downloader(book_url)
    # We don't want to clutter stdout with book contents if this test fails.
    # The stub accepts any arguments so that print() calls with several
    # positional values or keywords (sep=, end=, file=) do not raise TypeError.
    monkeypatch.setattr("builtins.print", lambda *args, **kwargs: None)
    md_book = book_downloader.download()
    assert os.path.getsize(md_book.input_content) in range(120000, 160000)
    md_book.to_pdf()
    assert os.path.getsize(md_book.pdf_path) in range(350000, 400000)
def test_text_document_download(cwd_to_tmpdir):
    """Download a document as text and check the size of the text file and PDF."""
    # The URL must be a single literal; splitting it across lines leaves an
    # unterminated string.
    text_doc_url = (
        "https://www.scribd.com/document/96882378/Trademark-License-Agreement"
    )
    text_downloader = Downloader(text_doc_url)
    md_doc = text_downloader.download(is_image_document=False)
    assert os.path.getsize(md_doc.input_content) in range(1000, 2000)
    md_doc.to_pdf()
    assert os.path.getsize(md_doc.pdf_path) in range(20000, 31000)
def test_img_document_download(cwd_to_tmpdir):
    """Download a document as images and check the image count and PDF size."""
    # The URL must be a single literal; splitting it across lines leaves an
    # unterminated string.
    img_doc_url = (
        "https://www.scribd.com/doc/136711944/"
        "Signature-Scanning-and-Verification-in-Finacle"
    )
    img_downloader = Downloader(img_doc_url)
    imgs = img_downloader.download(is_image_document=True)
    # Exactly two entries are expected in the downloaded content list.
    assert len(imgs.input_content) == 2
    imgs.to_pdf()
    assert os.path.getsize(imgs.pdf_path) in range(140000, 150000)
###################################################################################
####scribd-downloader-
1.1.1/test/test_internals.py######################################################0
000664#0000000#0000000#00000000764#13407400542#0021511#0###########################
#########################################################################ustar#00ro
ot############################root############################0000000#0000000######
###################################################################################
###############################################################################from
scribdl.internals import sanitize_title
from scribdl import internals
import pytest
# (input title, expected sanitized title) pairs fed to test_sanitize_title.
SANITIZE_TITLE_TEST_TABLE = [
    ("good_title", "good_title"),
    ("*bla", "_bla"),
    ("**free_as_in_<freedom>**", "__free_as_in__freedom___"),
    # The backslash is written as "\\" so the literal holds one real backslash
    # without relying on the invalid escape sequence "\<".
    ("troller*\"/\\<>:|(haha)jojo", "troller_________haha_jojo"),
]
@pytest.mark.parametrize("input_str, expected_str", SANITIZE_TITLE_TEST_TABLE)
def test_sanitize_title(input_str, expected_str):
    """Check one table row: sanitizing ``input_str`` must give ``expected_str``."""
    sanitized = internals.sanitize_title(input_str)
    assert sanitized == expected_str
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
###################################################################################
#

Scribd-1 1 1 Tar

Cargado por

Información del documento

Título original

Derechos de autor

Formatos disponibles

Compartir este documento

Compartir o incrustar documentos

Opciones para compartir

¿Le pareció útil este documento?

¿Este contenido es inapropiado?

Copyright:

Formatos disponibles

Scribd-1 1 1 Tar

Cargado por

Copyright:

Formatos disponibles

pax_global_header##################################################################

Copyright (c) 2016 Ritiek Malhotra

Permission is hereby granted, free of charge, to any person obtaining a copy

|PyPi Version| |Build Status|

(I also found an online service https://dlscrib.com/ created by `Erik Fong`_. It

There are two types of documents on Scribd:

This script takes a different approach to both of them:

- Documents consisting of a collection of images is straightforward and

$ pip install scribd-downloader

or install the development version with:

$ python setup.py install

usage: scribdl [-h] [-i] [-p] URL

Download documents and books from scribd.com

Downloading text from document containing selectable text:

(Text will be saved side by side in a ``.md`` file in the current

(Images will be saved in the current working directory)

Pass ``--pdf`` option to convert the generated output to a PDF.

``The MIT License``

.. |PyPi Version| image:: https://img.shields.io/pypi/v/scribd-downloader.svg

.. |Build Status| image:: https://travis-ci.org/ritiek/scribd-downloader.svg?

.. _Erik Fong: mailto:dlscrib@gmail.com

from .downloader import Downloader

from .document import ScribdTextualDocument

from .pdf_converter import ConvertToPDF

from .base import ScribdBase

def __init__(self, url):

def _extract_text(self, content, chapter, token):

filename = self.book_id + ".md"

self._extract_text_blocks(json_response, chapter, token, filename)

def fetch_response(self, chapter, token):

def _extract_text_blocks(self, response_dict, chapter, token, filename):

if block["type"] in ("text", "image"):

def _process_image_text(self, block, image_url):

def _download_image(self, url, path):

def _extract_image_path_from_url(self, url):

def _format_content_url(self, chapter, token):

def _format_image_url(self, chapter, image, token):

def save_text(self, string_text, filename):

parser.add_argument("url", metavar="URL", type=str, help="scribd url to

def __init__(self, url):

def _extract_jsonp_url(self, inner_opening):

print("Extracting text to " + title + ".md\n")

def text_extractor(self, jsonp_urls, filename):

def save_text(self, jsonp, filename):

for x in soup_content.find_all("span", {"class": "a"}):

# sometimes images embedded directly in html as well

def image_extractor(self, jsonp_urls, initial_filename):

def _html_image_extractor(self, initial_filename):

def convert_to_image_url(self, url, found):

def save_image(self, jsonp_url, imagename, found=False):

url = self.convert_to_image_url(jsonp_url, found)

from .document import ScribdTextualDocument

from .pdf_converter import ConvertToPDF

def __init__(self, url):

def download(self, is_image_document=None):

def _download_document(self, image_document):

Also change ' ' to '_' to preserve previous behavior.

def __init__(self, input_content, output_path):

from setuptools import setup, find_packages

# __version__ comes into namespace from here

with open("README.rst", "r") as f:

def test_book_download(cwd_to_tmpdir, monkeypatch):

@pytest.mark.parametrize("input_str, expected_str", SANITIZE_TITLE_TEST_TABLE)

def __init__(self, url):

def __init__(self, url):

def __init__(self, url):

def __init__(self, input_content, output_path):

# __version__ comes into namespace from here