
Debug #6

Open · wants to merge 43 commits into master
Changes from 3 commits
Commits (43)
ffadc77  adding profiling (femalves, Jan 4, 2024)
8c974e2  removing logs and adding profiling (femalves, Jan 4, 2024)
4ce2027  removing logs (femalves, Jan 4, 2024)
0e516bc  removing unnecessary logs (Jan 16, 2024)
0c32f67  removing error (Jan 16, 2024)
a517463  removing processing error (femalves, Jan 16, 2024)
19bbf80  fixing logs (femalves, Jan 17, 2024)
0968580  testing PyMuPDF (femalves, Jan 24, 2024)
c5108ce  reversing to img2pdf (femalves, Feb 5, 2024)
242c941  Adding debug log (femalves, Mar 11, 2024)
68defa3  modifying logs (femalves, Mar 11, 2024)
9cbd2ed  testing grabbing images as pdfs and merging (femalves, Mar 15, 2024)
be7150c  checking how fast downloading files is (femalves, Mar 15, 2024)
40558b1  trying pymupdf with logs (femalves, Mar 15, 2024)
1e4293b  Revert "modifying logs" (femalves, Mar 18, 2024)
5429b18  Revert "testing grabbing images as pdfs and merging" (femalves, Mar 18, 2024)
a3a0dd3  code to download pre-computed PDFs (femalves, Apr 1, 2024)
7edeba2  adding changes to image proxy (femalves, Apr 3, 2024)
5646f10  fixing bug (femalves, Apr 3, 2024)
471385c  adding s to pdf (femalves, Apr 4, 2024)
838e9d5  checking if bucket name is correct (femalves, Apr 5, 2024)
5862267  adding log (femalves, Apr 5, 2024)
0f40801  adding extra logs to capture error (femalves, Apr 8, 2024)
742dcda  changing s3_utils (femalves, Apr 8, 2024)
a323319  fixing bug (femalves, Apr 8, 2024)
762a697  cleaning code, adding tests, removing logs (femalves, Apr 10, 2024)
a378240  adding gu=ithub actions (femalves, Apr 12, 2024)
d7c619c  adjusting python version (femalves, Apr 15, 2024)
a0a8166  upgrading python (femalves, Apr 15, 2024)
e3c5fd5  solving 2to_3 bug (femalves, Apr 15, 2024)
a135daf  verbose (femalves, Apr 15, 2024)
1efabe6  changing psycopg (femalves, Apr 15, 2024)
da8badd  fixing bug (femalves, Apr 15, 2024)
9b77585  setuptools < 57 (femalves, Apr 15, 2024)
e0c0756  running without cache (femalves, Apr 15, 2024)
3d51c30  removing space (femalves, Apr 15, 2024)
677be7a  removing space (femalves, Apr 15, 2024)
decd74f  debugging (femalves, Apr 15, 2024)
4e52569  downgrading to python 3.8 (femalves, Apr 15, 2024)
66624f5  removing no cache (femalves, Apr 15, 2024)
de8e606  removing upgrade pip (femalves, Apr 15, 2024)
531dabb  adding quotes (femalves, Apr 15, 2024)
e553195  giving it another try (femalves, Apr 15, 2024)
59 changes: 26 additions & 33 deletions scan_explorer_service/views/image_proxy.py
@@ -3,14 +3,15 @@
from flask_discoverer import advertise
from urllib import parse as urlparse
import img2pdf
from io import BytesIO
import math
import sys
import requests
from scan_explorer_service.models import Collection, Page, Article
from scan_explorer_service.utils.db_utils import item_thumbnail
from scan_explorer_service.utils.utils import url_for_proxy
import time
import io
import cProfile
import pstats


bp_proxy = Blueprint('proxy', __name__, url_prefix='/image')
@@ -20,7 +21,6 @@
@bp_proxy.route('/iiif/2/<path:path>', methods=['GET'])
def image_proxy(path):
"""Proxy in between the image server and the user"""
current_app.logger.debug('######## Starting image proxy for image {} ########'.format(path))
req_url = urlparse.urljoin(f'{current_app.config.get("IMAGE_API_BASE_URL")}/', path)
req_headers = {key: value for (key, value) in request.headers if key != 'Host' and key != 'Accept'}

@@ -29,7 +29,6 @@ def image_proxy(path):

r = requests.request(request.method, req_url, params=request.args, stream=True,
headers=req_headers, allow_redirects=False, data=request.form)
current_app.logger.debug('Response = {r.text}')

excluded_headers = ['content-encoding','content-length', 'transfer-encoding', 'connection']
headers = [(name, value) for (name, value) in r.headers.items() if name.lower() not in excluded_headers]
@@ -38,29 +37,23 @@
def generate():
for chunk in r.raw.stream(decode_content=False):
yield chunk
current_app.logger.debug('######## Ending image proxy for image {path} ########')
return Response(generate(), status=r.status_code, headers=headers)


@advertise(scopes=['api'], rate_limit=[5000, 3600*24])
@bp_proxy.route('/thumbnail', methods=['GET'])
def image_proxy_thumbnail():
"""Helper to generate the correct url for a thumbnail given an ID and type"""

current_app.logger.debug('######## Starting image/thumbnail ########')
try:
id = request.args.get('id')
type = request.args.get('type')
current_app.logger.debug(f"Id {id}, Type {type}")
with current_app.session_scope() as session:
thumbnail_path = item_thumbnail(session, id, type)
current_app.logger.debug(f"Thumbnail path {thumbnail_path}")
path = urlparse.urlparse(thumbnail_path).path

remove = urlparse.urlparse(url_for_proxy('proxy.image_proxy', path='')).path
path = path.replace(remove, '')
current_app.logger.debug(f"Path {path}")
current_app.logger.debug('######## Finishing image/thumbnail ########')

return image_proxy(path)
except Exception as e:
current_app.logger.error(f'{e}')
@@ -70,7 +63,7 @@ def image_proxy_thumbnail():
@bp_proxy.route('/pdf', methods=['GET'])
def pdf_save():
"""Generate a PDF from pages"""
current_app.logger.debug('######## Starting PDF generation process ########')

try:
id = request.args.get('id')
page_start = request.args.get('page_start', 1, int)
@@ -80,6 +73,8 @@ def pdf_save():
scaling = float(dpi)/ 600
memory_limit = current_app.config.get("IMAGE_PDF_MEMORY_LIMIT")
page_limit = current_app.config.get("IMAGE_PDF_PAGE_LIMIT")

current_app.logger.debug(f'Memory limit: {memory_limit}')

@stream_with_context
def loop_images(id, page_start, page_end):
@@ -89,8 +84,6 @@ def loop_images(id, page_start, page_end):
item: Union[Article, Collection] = (
session.query(Article).filter(Article.id == id).one_or_none()
or session.query(Collection).filter(Collection.id == id).one_or_none())
current_app.logger.debug('######## Fetching article/collection {item} from the database ########')
fetch_start_time = time.time()
if isinstance(item, Article):
q = session.query(Article).filter(Article.id == item.id).one_or_none()
start_page = q.pages.first().volume_running_page_num
@@ -103,14 +96,7 @@ def loop_images(id, page_start, page_end):
Page.volume_running_page_num <= page_end).order_by(Page.volume_running_page_num).limit(page_limit)
else:
raise Exception("ID: " + id + " not found")
fetch_end_time = time.time()
fetch_time_elapsed = fetch_end_time - fetch_start_time
current_app.logger.debug(f'Fetching article/collection {item} took {fetch_time_elapsed:.2f} seconds to complete.')



current_app.logger.debug('######## Fetching images ########')
fetch_images_start_time = time.time()

for page in query.all():
n_pages += 1
if n_pages > page_limit:
@@ -126,18 +112,25 @@ def loop_images(id, page_start, page_end):
path = path.replace(remove, '')
im_data = image_proxy(path).get_data()
memory_sum += sys.getsizeof(im_data)
current_app.logger.debug(f'Page {n_pages}. Image data: {im_data}')
yield im_data
fetch_images_end_time = time.time()
fetch_images_time_elapsed = fetch_images_end_time - fetch_images_start_time
current_app.logger.debug(f'Fetching images for pages took {fetch_images_time_elapsed:.2f} seconds to complete.')

current_app.logger.debug('######## Starting image loop ########')
loop_start_time = time.time()
response = Response(img2pdf.convert([im for im in loop_images(id, page_start, page_end)]), mimetype='application/pdf')
loop_end_time = time.time()
loop_total_time = loop_end_time - loop_start_time
current_app.logger.debug(f'Image loop took {loop_total_time:.2f} seconds to complete.')


profiler = cProfile.Profile()
profiler.enable()
response = Response(img2pdf.convert([im for im in loop_images(id, page_start, page_end)]), mimetype='application/pdf')
profiler.disable()

# Log the profiling information
log_buffer = io.StringIO()
profiler_stats = pstats.Stats(profiler, stream=log_buffer)
profiler_stats.strip_dirs().sort_stats('cumulative', 'calls').print_stats(20)

formatted_stats = log_buffer.getvalue().splitlines()


current_app.logger.debug(f'==================Profiling information========================: \n')
for line in formatted_stats:
current_app.logger.debug(line)
return response
except Exception as e:
return jsonify(Message=str(e)), 400
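
For reference, the core change in this diff replaces ad-hoc time.time() debug logging with cProfile around the img2pdf.convert call. The snippet below is a minimal, standalone sketch of that pattern outside Flask, assuming a generic work() function and a local logger in place of current_app.logger; it is not part of the scan_explorer_service code, only an illustration of how the added cProfile/pstats block routes its output through a logger.

# Sketch (assumed names: work, logger): profile a call, then log the top
# cumulative-time entries line by line, as the PR does for pdf_save().
import cProfile
import io
import logging
import pstats

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger("profiling-demo")

def work():
    # Placeholder for the expensive call (in the PR: img2pdf.convert over the page images).
    return sum(i * i for i in range(200_000))

profiler = cProfile.Profile()
profiler.enable()
result = work()
profiler.disable()

# Render the stats into a string buffer so each line can be sent to the logger.
buffer = io.StringIO()
stats = pstats.Stats(profiler, stream=buffer)
stats.strip_dirs().sort_stats("cumulative", "calls").print_stats(20)

logger.debug("================== Profiling information ==================")
for line in buffer.getvalue().splitlines():
    logger.debug(line)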