Debug #6

Open

wants to merge 43 commits into master
Changes from 11 commits
Commits
43 commits
ffadc77
adding profiling
femalves Jan 4, 2024
8c974e2
removing logs and adding profiling
femalves Jan 4, 2024
4ce2027
removing logs
femalves Jan 4, 2024
0e516bc
removing unnecessary logs
Jan 16, 2024
0c32f67
removing error
Jan 16, 2024
a517463
removing processing error
femalves Jan 16, 2024
19bbf80
fixing logs
femalves Jan 17, 2024
0968580
testing PyMuPDF
femalves Jan 24, 2024
c5108ce
reversing to img2pdf
femalves Feb 5, 2024
242c941
Adding debug log
femalves Mar 11, 2024
68defa3
modifying logs
femalves Mar 11, 2024
9cbd2ed
testing grabbing images as pdfs and merging
femalves Mar 15, 2024
be7150c
checking how fast downloading files is
femalves Mar 15, 2024
40558b1
trying pymupdf with logs
femalves Mar 15, 2024
1e4293b
Revert "modifying logs"
femalves Mar 18, 2024
5429b18
Revert "testing grabbing images as pdfs and merging"
femalves Mar 18, 2024
a3a0dd3
code to download pre-computed PDFs
femalves Apr 1, 2024
7edeba2
adding changes to image proxy
femalves Apr 3, 2024
5646f10
fixing bug
femalves Apr 3, 2024
471385c
adding s to pdf
femalves Apr 4, 2024
838e9d5
checking if bucket name is correct
femalves Apr 5, 2024
5862267
adding log
femalves Apr 5, 2024
0f40801
adding extra logs to capture error
femalves Apr 8, 2024
742dcda
changing s3_utils
femalves Apr 8, 2024
a323319
fixing bug
femalves Apr 8, 2024
762a697
cleaning code, adding tests, removing logs
femalves Apr 10, 2024
a378240
adding github actions
femalves Apr 12, 2024
d7c619c
adjusting python version
femalves Apr 15, 2024
a0a8166
upgrading python
femalves Apr 15, 2024
e3c5fd5
solving 2to_3 bug
femalves Apr 15, 2024
a135daf
verbose
femalves Apr 15, 2024
1efabe6
changing psycopg
femalves Apr 15, 2024
da8badd
fixing bug
femalves Apr 15, 2024
9b77585
setuptools < 57
femalves Apr 15, 2024
e0c0756
running without cache
femalves Apr 15, 2024
3d51c30
removing space
femalves Apr 15, 2024
677be7a
removing space
femalves Apr 15, 2024
decd74f
debugging
femalves Apr 15, 2024
4e52569
downgrading to python 3.8
femalves Apr 15, 2024
66624f5
removing no cache
femalves Apr 15, 2024
de8e606
removing upgrade pip
femalves Apr 15, 2024
531dabb
adding quotes
femalves Apr 15, 2024
e553195
giving it another try
femalves Apr 15, 2024
4 changes: 3 additions & 1 deletion dev-requirements.txt
@@ -2,4 +2,6 @@ Flask-Testing==0.8.1
coverage==5.2.1
testing.postgresql==1.3.0
pytest==7.1.2
pytest-cov==3.0.0
pytest-cov==3.0.0
PyMuPDF==1.23.15
PyMuPDFb==1.23.9
4 changes: 3 additions & 1 deletion requirements.txt
@@ -12,4 +12,6 @@ opensearch-py==2.0.0
setuptools<58
alembic==1.8.0
img2pdf==0.4.4
appmap>=1.1.0.dev0
appmap>=1.1.0.dev0
PyMuPDF==1.23.15
PyMuPDFb==1.23.9
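
Several commits above test PyMuPDF (imported as fitz) as an alternative to img2pdf for assembling page images into a single PDF, before reverting to img2pdf. A minimal sketch of both approaches, assuming the page images are already held in memory as encoded bytes and that their format is known; the function names and the default "jpg" filetype are illustrative assumptions, not part of this PR:

import fitz  # PyMuPDF
import img2pdf

def images_to_pdf_pymupdf(image_blobs, filetype="jpg"):
    # Build an empty PDF and append each image as its own page.
    out = fitz.open()
    for blob in image_blobs:
        img_doc = fitz.open(stream=blob, filetype=filetype)  # open the encoded image
        pdf_bytes = img_doc.convert_to_pdf()                  # wrap it in a one-page PDF
        out.insert_pdf(fitz.open("pdf", pdf_bytes))
    return out.tobytes()

def images_to_pdf_img2pdf(image_blobs):
    # img2pdf embeds the encoded images directly, without re-encoding them.
    return img2pdf.convert(list(image_blobs))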
32 changes: 16 additions & 16 deletions scan_explorer_service/manifest_factory.py
@@ -13,36 +13,36 @@ class ManifestFactoryExtended(ManifestFactory):
"""

def create_manifest(self, item: Union[Article, Collection]):
current_app.logger.debug(f"Creating manifest for item: {item}")
# current_app.logger.debug(f"Creating manifest for item: {item}")
manifest = self.manifest(
ident=f'{item.id}/manifest.json', label=item.id)
manifest.description = item.id
manifest.add_sequence(self.create_sequence(item))
for range in self.create_range(item):
manifest.add_range(range)
current_app.logger.debug(f"Manifest created: {manifest}")
# current_app.logger.debug(f"Manifest created: {manifest}")
return manifest

def create_sequence(self, item: Union[Article, Collection]):
current_app.logger.debug(f"Creating sequence for item: {item}")
# current_app.logger.debug(f"Creating sequence for item: {item}")
sequence: Sequence = self.sequence()
current_app.logger.debug(f"Sequence is: {sequence}. Adding canvases to sequence.")
# current_app.logger.debug(f"Sequence is: {sequence}. Adding canvases to sequence.")
for page in item.pages:
current_app.logger.debug(f"Adding canvas to sequence: {page}.")
# current_app.logger.debug(f"Adding canvas to sequence: {page}.")
sequence.add_canvas(self.get_or_create_canvas(page))
current_app.logger.debug(f"Final sequence created: {sequence}")
# current_app.logger.debug(f"Final sequence created: {sequence}")
return sequence

def create_range(self, item: Union[Article, Collection]):
current_app.logger.debug(f"Creating range for item: {item}")
# current_app.logger.debug(f"Creating range for item: {item}")
if isinstance(item, Collection):
return list(chain(*[self.create_range(article) for article in item.articles]))

range: Range = self.range(ident=item.bibcode, label=item.bibcode)
for page in item.pages:
current_app.logger.debug(f"Adding canvas to range: {page}.")
# current_app.logger.debug(f"Adding canvas to range: {page}.")
range.add_canvas(self.get_or_create_canvas(page))
current_app.logger.debug(f"Range created: {[range]}")
# current_app.logger.debug(f"Range created: {[range]}")
return [range]

def get_canvas_dict(self) -> Dict[str, Canvas]:
@@ -51,7 +51,7 @@ def get_canvas_dict(self) -> Dict[str, Canvas]:
return self.canvas_dict

def get_or_create_canvas(self, page: Page):
current_app.logger.debug(f"Getting or creating canvas for page: {page}")
# current_app.logger.debug(f"Getting or creating canvas for page: {page}")
canvas_dict = self.get_canvas_dict()
if(page.id in canvas_dict.keys()):
return canvas_dict[page.id]
@@ -70,28 +70,28 @@ def get_or_create_canvas(self, page: Page):
canvas.add_annotation(annotation)
canvas_dict[page.id] = canvas

current_app.logger.debug(f"Canvas created: {canvas}")
# current_app.logger.debug(f"Canvas created: {canvas}")
return canvas

def create_image_annotation(self, page: Page):
current_app.logger.debug(f"Creating image annotation for page: {page}")
# current_app.logger.debug(f"Creating image annotation for page: {page}")
annotation: Annotation = self.annotation(ident=str(page.id))
image: Image = annotation.image(
ident=page.image_path, label=f'p. {page.label}', iiif=True)

# Override default image quality and format set by prezi
image.id = image.id.replace(f'/default.jpg', f'/{page.image_color_quality}.tif')
current_app.logger.debug(f"Image id: {image.id}")
# current_app.logger.debug(f"Image id: {image.id}")
image.format = page.format
image.height = page.height
image.width = page.width
current_app.logger.debug(f"Image annotation created: {annotation}")
# current_app.logger.debug(f"Image annotation created: {annotation}")
return annotation

def add_search_service(self, manifest: Manifest, search_url: str):
current_app.logger.debug(f"Adding search services for manifest {manifest} and search url {search_url}")
# current_app.logger.debug(f"Adding search services for manifest {manifest} and search url {search_url}")
context = 'http://iiif.io/api/search/1/context.json'
profile = 'http://iiif.io/api/search/1/search'

manifest.add_service(ident=search_url, context=context, profile=profile)
current_app.logger.debug(f"Adding search services for manifest {manifest} and search url {search_url}")
# current_app.logger.debug(f"Adding search services for manifest {manifest} and search url {search_url}")
4 changes: 2 additions & 2 deletions scan_explorer_service/utils/utils.py
@@ -5,10 +5,10 @@ def url_for_proxy(endpoint: str, **values):
values['_external'] = False

server, prefix = proxy_url()
current_app.logger.debug(f"Server is {server} and prefix is {prefix}.")
# current_app.logger.debug(f"Server is {server} and prefix is {prefix}.")
path = url_for(endpoint, **values).lstrip('/')

current_app.logger.debug(f"Url is {server}/{prefix}/{path}.")
# current_app.logger.debug(f"Url is {server}/{prefix}/{path}.")
return f'{server}/{prefix}/{path}'

def proxy_url():
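
To make the behaviour of url_for_proxy above concrete, a small illustrative trace; the server, prefix, and path values are assumptions, not taken from any real config:

# Hypothetical values for illustration only.
# proxy_url() -> ('https://example.org', 'scan-service')
# url_for('proxy.image_proxy', path='abc/full/full/0/default.jpg', _external=False)
#     -> '/image/iiif/2/abc/full/full/0/default.jpg'
# url_for_proxy('proxy.image_proxy', path='abc/full/full/0/default.jpg') then returns:
#     'https://example.org/scan-service/image/iiif/2/abc/full/full/0/default.jpg'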
87 changes: 52 additions & 35 deletions scan_explorer_service/views/image_proxy.py
@@ -3,15 +3,17 @@
from flask_discoverer import advertise
from urllib import parse as urlparse
import img2pdf
from io import BytesIO
import math
import sys
import requests
from scan_explorer_service.models import Collection, Page, Article
from scan_explorer_service.utils.db_utils import item_thumbnail
from scan_explorer_service.utils.utils import url_for_proxy
import time

import io
import cProfile
import pstats
import fitz
import json

bp_proxy = Blueprint('proxy', __name__, url_prefix='/image')

@@ -20,16 +22,28 @@
@bp_proxy.route('/iiif/2/<path:path>', methods=['GET'])
def image_proxy(path):
"""Proxy in between the image server and the user"""
current_app.logger.debug('######## Starting image proxy for image {} ########'.format(path))
req_url = urlparse.urljoin(f'{current_app.config.get("IMAGE_API_BASE_URL")}/', path)
req_headers = {key: value for (key, value) in request.headers if key != 'Host' and key != 'Accept'}

req_headers['X-Forwarded-Host'] = current_app.config.get('PROXY_SERVER')
req_headers['X-Forwarded-Path'] = current_app.config.get('PROXY_PREFIX').rstrip('/') + '/image'
request_method = request.method
params = request.args
data = request.form


current_app.logger.debug('---------------')
current_app.logger.debug(f'Request method: {request_method}')
current_app.logger.debug(f'Request url: {req_url}')
current_app.logger.debug(f'Request headers: {req_headers}')
current_app.logger.debug(f'Params: {params}')
current_app.logger.debug(f'Data: {json.dumps(data, indent=4)}')
current_app.logger.debug('---------------')

r = requests.request(request.method, req_url, params=request.args, stream=True,
headers=req_headers, allow_redirects=False, data=request.form)
current_app.logger.debug(f'Response = {r.text}')



excluded_headers = ['content-encoding','content-length', 'transfer-encoding', 'connection']
headers = [(name, value) for (name, value) in r.headers.items() if name.lower() not in excluded_headers]
@@ -38,29 +52,23 @@ def image_proxy(path):
def generate():
for chunk in r.raw.stream(decode_content=False):
yield chunk
current_app.logger.debug(f'######## Ending image proxy for image {path} ########')
return Response(generate(), status=r.status_code, headers=headers)


@advertise(scopes=['api'], rate_limit=[5000, 3600*24])
@bp_proxy.route('/thumbnail', methods=['GET'])
def image_proxy_thumbnail():
"""Helper to generate the correct url for a thumbnail given an ID and type"""

current_app.logger.debug('######## Starting image/thumbnail ########')
try:
id = request.args.get('id')
type = request.args.get('type')
current_app.logger.debug(f"Id {id}, Type {type}")
with current_app.session_scope() as session:
thumbnail_path = item_thumbnail(session, id, type)
current_app.logger.debug(f"Thumbnail path {thumbnail_path}")
path = urlparse.urlparse(thumbnail_path).path

remove = urlparse.urlparse(url_for_proxy('proxy.image_proxy', path='')).path
path = path.replace(remove, '')
current_app.logger.debug(f"Path {path}")
current_app.logger.debug('######## Finishing image/thumbnail ########')

return image_proxy(path)
except Exception as e:
current_app.logger.error(f'{e}')
@@ -70,8 +78,11 @@ def image_proxy_thumbnail():
@bp_proxy.route('/pdf', methods=['GET'])
def pdf_save():
"""Generate a PDF from pages"""
current_app.logger.debug('######## Starting PDF generation process ########')

try:
profiler = cProfile.Profile()
profiler.enable()

id = request.args.get('id')
page_start = request.args.get('page_start', 1, int)
page_end = request.args.get('page_end', math.inf, int)
@@ -80,17 +91,19 @@ def pdf_save():
scaling = float(dpi)/ 600
memory_limit = current_app.config.get("IMAGE_PDF_MEMORY_LIMIT")
page_limit = current_app.config.get("IMAGE_PDF_PAGE_LIMIT")

current_app.logger.debug(f'Memory limit: {memory_limit}')

@stream_with_context
def loop_images(id, page_start, page_end):
n_pages = 0
memory_sum = 0
current_app.logger.debug("Starting...")
with current_app.session_scope() as session:
current_app.logger.debug(f'Fetching items')
item: Union[Article, Collection] = (
session.query(Article).filter(Article.id == id).one_or_none()
or session.query(Collection).filter(Collection.id == id).one_or_none())
current_app.logger.debug(f'######## Fetching article/collection {item} from the database ########')
fetch_start_time = time.time()
if isinstance(item, Article):
q = session.query(Article).filter(Article.id == item.id).one_or_none()
start_page = q.pages.first().volume_running_page_num
@@ -103,19 +116,17 @@ def loop_images(id, page_start, page_end):
Page.volume_running_page_num <= page_end).order_by(Page.volume_running_page_num).limit(page_limit)
else:
raise Exception("ID: " + id + " not found")
fetch_end_time = time.time()
fetch_time_elapsed = fetch_end_time - fetch_start_time
current_app.logger.debug(f'Fetching article/collection {item} took {fetch_time_elapsed:.2f} seconds to complete.')



current_app.logger.debug('######## Fetching images ########')
fetch_images_start_time = time.time()

for page in query.all():

n_pages += 1

current_app.logger.debug(f"Generating image for page: {n_pages}")
current_app.logger.debug(f'Id: {page.id}, Volume_page: {page.volume_running_page_num}, memory: {memory_sum}')
if n_pages > page_limit:
break
if memory_sum > memory_limit:
current_app.logger.debug(f"Memory limit reached: {memory_sum} > {memory_limit}")
break
size = 'full'
if dpi != 600:
@@ -124,20 +135,26 @@ def loop_images(id, page_start, page_end):
path = urlparse.urlparse(image_url).path
remove = urlparse.urlparse(url_for_proxy('proxy.image_proxy', path='')).path
path = path.replace(remove, '')
current_app.logger.debug(f"Getting image data...: {n_pages}")
im_data = image_proxy(path).get_data()
memory_sum += sys.getsizeof(im_data)
current_app.logger.debug(f'Page {n_pages}. Image data: {im_data}')
yield im_data
fetch_images_end_time = time.time()
fetch_images_time_elapsed = fetch_images_end_time - fetch_images_start_time
current_app.logger.debug(f'Fetching images for pages took {fetch_images_time_elapsed:.2f} seconds to complete.')

current_app.logger.debug('######## Starting image loop ########')
loop_start_time = time.time()
response = Response(img2pdf.convert([im for im in loop_images(id, page_start, page_end)]), mimetype='application/pdf')
loop_end_time = time.time()
loop_total_time = loop_end_time - loop_start_time
current_app.logger.debug(f'Image loop took {loop_total_time:.2f} seconds to complete.')


response = Response(img2pdf.convert([im for im in loop_images(id, page_start, page_end)]), mimetype='application/pdf')
profiler.disable()

# Log the profiling information
log_buffer = io.StringIO()
profiler_stats = pstats.Stats(profiler, stream=log_buffer)
profiler_stats.strip_dirs().sort_stats('cumulative', 'calls').print_stats(20)

formatted_stats = log_buffer.getvalue().splitlines()


current_app.logger.debug(f'==================Profiling information========================: \n')
for line in formatted_stats:
current_app.logger.debug(line)

return response
except Exception as e:
return jsonify(Message=str(e)), 400
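
While the cProfile instrumentation is in place, the endpoint can be exercised with a plain HTTP request so that the pstats output lands in the service log. A minimal sketch; the host, item id, and parameter values are illustrative assumptions:

import requests

# Hypothetical host and item id; the query parameters match those read by pdf_save.
resp = requests.get(
    "http://localhost:5000/image/pdf",
    params={"id": "<article-or-collection-id>", "page_start": 1, "page_end": 5, "dpi": 300},
    timeout=600,
)
resp.raise_for_status()
with open("pages.pdf", "wb") as fh:
    fh.write(resp.content)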
14 changes: 7 additions & 7 deletions scan_explorer_service/views/manifest.py
@@ -26,25 +26,25 @@ def before_request():
def get_manifest(id: str):
""" Creates an IIIF manifest from an article or Collection"""

current_app.logger.debug(f"Fetching manifest for item with id : {id}")
# current_app.logger.debug(f"Fetching manifest for item with id : {id}")
with current_app.session_scope() as session:
current_app.logger.debug(f"Fetching item (Article/Collection).")
# current_app.logger.debug(f"Fetching item (Article/Collection).")
item: Union[Article, Collection] = (
session.query(Article).filter(Article.id == id).one_or_none()
or session.query(Collection).filter(Collection.id == id).one_or_none())

if item:
current_app.logger.debug(f"Item successfully fetched. Creating manifest for item: {item}")
# current_app.logger.debug(f"Item successfully fetched. Creating manifest for item: {item}")
manifest = manifest_factory.create_manifest(item)
current_app.logger.debug(f"Getting url for proxy for endpoint manifest.search and id {id}")
# current_app.logger.debug(f"Getting url for proxy for endpoint manifest.search and id {id}")
search_url = url_for_proxy('manifest.search', id=id)
current_app.logger.debug(f"Getting url for proxy for endpoint manifest.search and id {id}")
# current_app.logger.debug(f"Getting url for proxy for endpoint manifest.search and id {id}")
manifest_factory.add_search_service(manifest, search_url)
current_app.logger.debug(f"Manifest generated successfully for ID: {id}")
# current_app.logger.debug(f"Manifest generated successfully for ID: {id}")

return manifest.toJSON(top=True)
else:
current_app.logger.debug(f"Item not found for article with id: {id}")
# current_app.logger.debug(f"Item not found for article with id: {id}")
return jsonify(exception='Article not found'), 404

