Bash Script to scale and/or resize PDFs from the command line.
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

504 lines
16 KiB

  1. #!/usr/bin/env bash
  2. # pdfScale.sh
  3. #
  4. # Scale PDF to specified percentage of original size.
  5. #
  6. # Gustavo Arnosti Neves - 2016 / 07 / 10
  7. #
  8. # This script: https://github.com/tavinus/pdfScale
  9. # Based on: http://ma.juii.net/blog/scale-page-content-of-pdf-files
  10. # And: https://gist.github.com/MichaelJCole/86e4968dbfc13256228a
  11. ###################################################
  12. # PAGESIZE LOGIC
  13. # 1- Try to get Mediabox with CAT/GREP
  14. # Remove /BBox search as it is unreliable
  15. # 2- MacOS => try to use mdls
  16. # Linux => try to use pdfinfo
  17. # 3- Try to use identify (imagemagick)
  18. # 4- Fail
  19. # Remove postscript method,
  20. # may have licensing problems
  21. ###################################################
  # ---------------------------------------------------------------
  # Script-wide configuration and state.
  # These globals are read/written by the functions defined below.
  # ---------------------------------------------------------------
  VERSION="1.4.6"
  SCALE="0.95"                    # scaling factor (0.95 = 95%, e.g.)
  VERBOSE=0                       # verbosity level (incremented by each -v)
  BASENAME="${0##*/}"             # simplified name of this script (safe with spaces in $0)

  # Set after we check dependencies
  GSBIN=""                        # GhostScript binary
  BCBIN=""                        # BC math binary
  IDBIN=""                        # ImageMagick identify binary
  PDFINFOBIN=""                   # pdfinfo binary
  MDLSBIN=""                      # MacOS mdls binary

  OSNAME="$(uname 2>/dev/null)"   # Check where we are running

  # Force the C locale so numbers always use '.' as the decimal token
  # (some languages use ','). The variables are exported so that the
  # external tools we spawn see the same locale as this shell does.
  export LC_MEASUREMENT="C"
  export LC_ALL="C"
  export LC_CTYPE="C"
  export LC_NUMERIC="C"

  TRUE=0                          # shell-style booleans: 0 is success/true
  FALSE=1

  ADAPTIVEMODE=$TRUE              # Automatically try to guess best mode
  MODE=""                         # forced detection mode, empty when adaptive
  USEIMGMGK=$FALSE                # ImageMagick flag, will use identify if true
  USECATGREP=$FALSE               # Use old cat + grep method
  # Prints "<script name> v<version>".
  #   $1 - when it evaluates to 2 the line goes to stderr,
  #        otherwise (including when omitted) it goes to stdout.
  printVersion() {
    local targetFd=1
    [[ $1 -eq 2 ]] && targetFd=2
    echo "$BASENAME v$VERSION" >&"$targetFd"
  }
  # Prints the version banner followed by the full help text on stdout.
  # The synopsis lists -m <mode>, which is the option the getopts loop
  # actually accepts (there are no -i / -c switches).
  printHelp() {
    printVersion
    cat <<EOF

Usage: $BASENAME [-v] [-s <factor>] [-m <mode>] <inFile.pdf> [outfile.pdf]
       $BASENAME -h
       $BASENAME -V

Parameters:
 -v          Verbose mode, prints extra information
             Use twice for even more information
 -h          Print this help to screen and exits
 -V          Prints version to screen and exits
 -m <mode>   Force a mode of page size detection.
             Will disable the Adaptive Mode.
 -s <factor> Changes the scaling factor, defaults to 0.95
             MUST be a number bigger than zero.
             Eg. -s 0.8 for 80% of the original size

Modes:
 a, adaptive  Default mode, tries all the methods below
 c, cat+grep  Forces the use of the cat + grep method
 m, mdls      Forces the use of MacOS Quartz mdls
 p, pdfinfo   Forces the use of Linux PdfInfo
 i, identify  Forces the use of ImageMagick's Identify

Notes:
 - Page size detection will try different modes until it gets
   a page size, or you can force a mode with -m 'mode'
 - Options must be passed before the file names to be parsed
 - The output filename is optional. If no file name is passed
   the output file will have the same name/destination of the
   input file, with .SCALED.pdf at the end (instead of just .pdf)
 - Having the extension .pdf on the output file name is optional,
   it will be added if not present
 - Should handle file names with spaces without problems
 - The scaling is centered and using a scale bigger than 1 may
   result on cropping parts of the pdf.

Examples:
 $BASENAME myPdfFile.pdf
 $BASENAME myPdfFile.pdf myScaledPdf
 $BASENAME -v -v myPdfFile.pdf
 $BASENAME -s 0.85 myPdfFile.pdf myScaledPdf.pdf
 $BASENAME -m pdfinfo -s 0.80 -v myPdfFile.pdf
 $BASENAME -v -v -m i -s 0.7 myPdfFile.pdf
 $BASENAME -h

EOF
  }
  # Prints the version banner and a short usage synopsis on stderr,
  # then terminates the script with exit code 1.
  # The synopsis shows -m <mode>, matching the options getopts accepts.
  usage() {
    printVersion 2
    echo >&2 "Usage: $BASENAME [-v] [-s <factor>] [-m <mode>] <inFile.pdf> [outfile.pdf]"
    echo >&2 "Try: $BASENAME -h # for help"
    exit 1
  }
  # Prints verbose information on stdout.
  #   $1 - message to print
  # Nothing is printed when VERBOSE is 0. When VERBOSE is greater than 1
  # the message is prefixed with a "YYYY-MM-DD:HH:MM:SS | " timestamp.
  vprint() {
    [[ $VERBOSE -eq 0 ]] && return 0
    # local: do not clobber a caller's / global "timestamp" variable
    local timestamp=""
    [[ $VERBOSE -gt 1 ]] && timestamp="$(date +%Y-%m-%d:%H:%M:%S) | "
    # printf instead of echo so messages such as "-n" are printed verbatim
    printf '%s\n' "$timestamp$1"
  }
  # Reports a missing dependency on stderr, with install hints for
  # apt-get, yum and homebrew, then aborts the script with exit code 3.
  #   $1 - name of the package that has to be installed
  printDependency() {
    local pkg=$1
    printVersion 2
    {
      printf '\n%s\n\n' "ERROR! You need to install the package '$pkg'"
      printf '%s\n' \
        "Linux apt-get.: sudo apt-get install $pkg" \
        "Linux yum.....: sudo yum install $pkg" \
        "MacOS homebrew: brew install $pkg"
      printf '\n%s\n' "Aborting..."
    } >&2
    exit 3
  }
  # Parses and validates the scaling factor, storing it in SCALE.
  #   $1 - factor; an unsigned decimal number greater than zero
  #        (e.g. 1, 0.8 or .5)
  # On invalid input an explanation is printed on stderr and the
  # script exits with code 2.
  #
  # The check is purely textual: inside [[ ]] the > operator compares
  # strings, not numbers, so it cannot be used to test "greater than
  # zero" (it accepts 0.0 and rejects .5). Instead we require a
  # well-formed unsigned decimal that contains at least one non-zero digit.
  parseScale() {
    local decimalRe='^([0-9]+([.][0-9]+)?|[.][0-9]+)$'
    if ! [[ $1 =~ $decimalRe && $1 == *[1-9]* ]]; then
      echo >&2 "Invalid factor: $1"
      echo >&2 "The factor must be a floating point number greater than 0"
      echo >&2 "Example: for 80% use 0.8"
      exit 2
    fi
    SCALE=$1
  }
  # Parses a forced mode of operation.
  #   $1 - mode name or one of its aliases
  # Sets MODE to CATGREP, IDENTIFY, MDLS or PDFINFO and ADAPTIVEMODE to
  # $FALSE for a forced mode; for the adaptive aliases MODE is emptied
  # and ADAPTIVEMODE is set to $TRUE.
  # Returns $TRUE for a recognised mode. For an empty or unknown mode a
  # notice is printed, adaptive mode is selected and $FALSE is returned.
  parseMode() {
    local chosenMode=""
    local adaptive=$TRUE
    local result=$TRUE

    case $1 in
      '')
        echo "Mode is empty, please specify the desired mode"
        echo "Falling back to adaptive mode!"
        result=$FALSE
        ;;
      c|catgrep|cat+grep|CatGrep|C|CATGREP)
        chosenMode="CATGREP"
        adaptive=$FALSE
        ;;
      i|imagemagick|identify|ImageMagick|Identify|I|IDENTIFY)
        chosenMode="IDENTIFY"
        adaptive=$FALSE
        ;;
      m|mdls|MDLS|quartz|mac|M)
        chosenMode="MDLS"
        adaptive=$FALSE
        ;;
      p|pdfinfo|PDFINFO|PdfInfo|P)
        chosenMode="PDFINFO"
        adaptive=$FALSE
        ;;
      a|adaptive|automatic|A|ADAPTIVE|AUTOMATIC)
        # explicit request for the default behaviour
        ;;
      *)
        echo "Invalid mode: $1"
        echo "Falling back to adaptive mode!"
        result=$FALSE
        ;;
    esac

    ADAPTIVEMODE=$adaptive
    MODE=$chosenMode
    return $result
  }
  # Gets the page size using ImageMagick's identify.
  #   Reads:  IDBIN, INFILEPDF, ADAPTIVEMODE, TRUE, FALSE
  #   Writes: PGWIDTH, PGHEIGHT (rounded to whole numbers)
  # When the binary is missing or prints nothing: returns $FALSE in
  # adaptive mode, aborts the script with exit code 15 in forced mode.
  getPageSizeImagemagick() {
    # Sanity: we need the identify binary
    if [[ ! -f $IDBIN ]]; then
      if [[ $ADAPTIVEMODE = $FALSE ]]; then
        echo "Error! ImageMagick's Identify was not found!"
        echo "Make sure you installed ImageMagick and have identify on your \$PATH"
        echo "Aborting! You may want to try the adaptive mode."
        exit 15
      elif [[ $ADAPTIVEMODE = $TRUE ]]; then
        return $FALSE
      fi
    fi

    # One "<width> <height>BREAKME" record is requested per page
    local rawSizes
    rawSizes="$("$IDBIN" -format '%[fx:w] %[fx:h]BREAKME' "$INFILEPDF" 2>/dev/null)"

    # No page size data available
    if [[ -z $rawSizes ]]; then
      if [[ $ADAPTIVEMODE = $FALSE ]]; then
        echo "Error when reading input file!"
        echo "Could not determine the page size!"
        echo "ImageMagicks's Identify returned an empty string!"
        echo "Aborting! You may want to try the adaptive mode."
        exit 15
      elif [[ $ADAPTIVEMODE = $TRUE ]]; then
        return $FALSE
      fi
    fi

    # Keep only the first record, i.e. the size of the first page
    local firstPage="${rawSizes%%BREAKME*}"
    local -a dims=($firstPage)                 # split into width / height
    PGWIDTH=$(printf '%.0f' "${dims[0]}")
    PGHEIGHT=$(printf '%.0f' "${dims[1]}")
  }
  # Gets the page size using Mac Quartz mdls.
  #   Reads:  MDLSBIN, INFILEPDF, ADAPTIVEMODE, TRUE, FALSE
  #   Writes: PGWIDTH, PGHEIGHT (rounded to whole numbers)
  # Returns $TRUE on success. When mdls is missing, prints nothing or
  # does not report numeric values for both attributes: returns $FALSE
  # in adaptive mode, aborts the script with exit code 15 in forced mode.
  #
  # The "name = value" lines are matched by attribute name rather than
  # by position, so the height can never be stored as the width (or
  # vice versa) whatever order the attributes are printed in.
  getPageSizeMdls() {
    # Sanity: we need the mdls binary
    if [[ ! -f $MDLSBIN ]]; then
      if [[ $ADAPTIVEMODE = "$TRUE" ]]; then
        return $FALSE
      fi
      echo "Error! Mac Quartz mdls was not found!"
      echo "Are you even trying this on a Mac?"
      echo "Aborting! You may want to try the adaptive mode."
      exit 15
    fi

    # get data from mdls
    # NOTE(review): the "-mdls" argument is kept from the original
    # command line; confirm against the mdls man page that it is needed.
    local mdlsOut
    mdlsOut="$("$MDLSBIN" -mdls -name kMDItemPageHeight -name kMDItemPageWidth "$INFILEPDF" 2>/dev/null)"

    if [[ -z $mdlsOut ]]; then
      if [[ $ADAPTIVEMODE = "$TRUE" ]]; then
        return $FALSE
      fi
      echo "Error when reading input file!"
      echo "Could not determine the page size!"
      echo "Mac Quartz mdls returned an empty string!"
      echo "Aborting! You may want to try the adaptive mode."
      exit 15
    fi

    # Pick the values out of the "kMDItem... = value" lines by name
    local line key value width="" height=""
    while IFS= read -r line; do
      [[ $line == *=* ]] || continue
      key=${line%%=*}
      value=${line#*=}
      key=${key//[[:space:]]/}
      value=${value//[[:space:]]/}
      case $key in
        kMDItemPageWidth)  width=$value ;;
        kMDItemPageHeight) height=$value ;;
      esac
    done <<< "$mdlsOut"

    # Rejects missing attributes as well as placeholders such as (null)
    local numberRe='^[0-9]+([.][0-9]+)?$'
    if ! [[ $width =~ $numberRe && $height =~ $numberRe ]]; then
      if [[ $ADAPTIVEMODE = "$TRUE" ]]; then
        return $FALSE
      fi
      echo "Error when reading input file!"
      echo "Could not determine the page size!"
      echo "Mac Quartz mdls did not report a valid page width and height!"
      echo "Aborting! You may want to try the adaptive mode."
      exit 15
    fi

    PGWIDTH=$(printf '%.0f' "$width")
    PGHEIGHT=$(printf '%.0f' "$height")
    return $TRUE
  }
  # Gets the page size using pdfinfo.
  #   Reads:  PDFINFOBIN, INFILEPDF, ADAPTIVEMODE, TRUE, FALSE
  #   Writes: PGWIDTH, PGHEIGHT (rounded to whole numbers)
  # When the binary is missing or no "Page size:" line is found:
  # returns $FALSE in adaptive mode, aborts the script with exit
  # code 15 in forced mode.
  getPageSizePdfInfo() {
    # Sanity: we need the pdfinfo binary
    if [[ ! -f $PDFINFOBIN ]]; then
      if [[ $ADAPTIVEMODE = $FALSE ]]; then
        echo "Error! Linux pdfinfo was not found!"
        echo "Do you have pdfinfo installed and available on your \$PATH?"
        echo "Aborting! You may want to try the adaptive mode."
        exit 15
      elif [[ $ADAPTIVEMODE = $TRUE ]]; then
        return $FALSE
      fi
    fi

    # get the "Page size:" line from pdfinfo
    local sizeLine
    sizeLine="$("$PDFINFOBIN" "$INFILEPDF" 2>/dev/null | grep -i 'Page size:' )"

    if [[ -z $sizeLine ]]; then
      if [[ $ADAPTIVEMODE = $FALSE ]]; then
        echo "Error when reading input file!"
        echo "Could not determine the page size!"
        echo "Linux PdfInfo returned an empty string!"
        echo "Aborting! You may want to try the adaptive mode."
        exit 15
      elif [[ $ADAPTIVEMODE = $TRUE ]]; then
        return $FALSE
      fi
    fi

    # What follows the label is split on whitespace; the width is taken
    # from field 0 and the height from field 2 (field 1 sits between them)
    local afterLabel="${sizeLine##*Page size:}"
    local -a fields=($afterLabel)
    PGWIDTH=$(printf '%.0f' "${fields[0]}")
    PGHEIGHT=$(printf '%.0f' "${fields[2]}")
  }
  # Gets the page size by grepping the raw PDF for its first /MediaBox.
  #   Reads:  INFILEPDF, ADAPTIVEMODE, TRUE, FALSE
  #   Writes: PGWIDTH, PGHEIGHT (3rd and 4th MediaBox numbers, rounded)
  # Returns $TRUE on success.
  # In adaptive mode every failure (no MediaBox, or a MediaBox that does
  # not hold usable numbers, e.g. an indirect reference "/MediaBox 5 0 R")
  # returns $FALSE so that the next detection method can be tried.
  # In forced mode a missing MediaBox exits with code 15 and an invalid
  # one exits with code 16.
  #
  # These are all possible:
  #   /MediaBox [0 0 595 841]
  #   /MediaBox [ 0 0 595.28 841.89]
  #   /MediaBox[ 0 0 595.28 841.89 ]
  getPageSizeCatGrep() {
    # First line of the file that mentions /MediaBox (-a: treat as text)
    local mediaBox
    mediaBox="$(grep -a -m1 -- '/MediaBox' "$INFILEPDF")"
    mediaBox="${mediaBox##*/MediaBox}"

    # No page size data available
    if [[ -z $mediaBox ]]; then
      if [[ $ADAPTIVEMODE = "$TRUE" ]]; then
        return $FALSE
      fi
      echo "Error when reading input file!"
      echo "Could not determine the page size!"
      echo "There is no MediaBox in the pdf document!"
      echo "Aborting! You may want to try the adaptive mode."
      exit 15
    fi

    # Drop whatever follows the closing bracket (other dictionary keys,
    # a trailing carriage return, ...) and then the opening bracket
    mediaBox=${mediaBox%%\]*}
    mediaBox=${mediaBox//\[/}

    # Split on whitespace without pathname expansion
    local -a box=()
    read -r -a box <<< "$mediaBox"

    # Sanity: four entries, and the two we use must be plain numbers
    local numberRe='^[+-]?([0-9]+[.]?[0-9]*|[.][0-9]+)$'
    if [[ ${#box[@]} -lt 4 ]] || ! [[ ${box[2]} =~ $numberRe && ${box[3]} =~ $numberRe ]]; then
      if [[ $ADAPTIVEMODE = "$TRUE" ]]; then
        return $FALSE
      fi
      echo "Error when reading the page size!"
      echo "The page size information is invalid!"
      exit 16
    fi

    # we are done
    PGWIDTH=$(printf '%.0f' "${box[2]}")    # Get Round Width
    PGHEIGHT=$(printf '%.0f' "${box[3]}")   # Get Round Height
    return $TRUE
  }
  # Detects the operation mode and obtains the page size.
  #   Forced mode (ADAPTIVEMODE = $FALSE): runs only the method named by
  #   MODE and returns $TRUE; an unknown MODE exits with code 20.
  #   Adaptive mode: tries cat+grep, then mdls (when OSNAME is Darwin) or
  #   pdfinfo (otherwise), then ImageMagick's identify, moving on while
  #   both PGWIDTH and PGHEIGHT are empty. Exits with code 17 when every
  #   method failed.
  getPageSize() {
    if [[ $ADAPTIVEMODE = $FALSE ]]; then
      vprint " Adaptive mode: Disabled"
      case $MODE in
        CATGREP)
          vprint " Method: Cat + Grep"
          getPageSizeCatGrep
          ;;
        MDLS)
          vprint " Method: Mac Quartz mdls"
          getPageSizeMdls
          ;;
        PDFINFO)
          vprint " Method: Linux PdfInfo"
          getPageSizePdfInfo
          ;;
        IDENTIFY)
          vprint " Method: ImageMagick's Identify"
          getPageSizeImagemagick
          ;;
        *)
          echo "Error! Invalid Mode: $MODE"
          echo "Aborting execution..."
          exit 20
          ;;
      esac
      return $TRUE
    fi

    vprint " Adaptive mode: Enabled"

    # 1st attempt: MediaBox straight from the file
    vprint " Method: Cat + Grep"
    getPageSizeCatGrep
    [[ -z $PGWIDTH && -z $PGHEIGHT ]] || return 0

    # 2nd attempt: the helper chosen by operating system
    vprint " Failed"
    if [[ $OSNAME = "Darwin" ]]; then
      vprint " Method: Mac Quartz mdls"
      getPageSizeMdls
    else
      vprint " Method: Linux PdfInfo"
      getPageSizePdfInfo
    fi
    [[ -z $PGWIDTH && -z $PGHEIGHT ]] || return 0

    # 3rd attempt: ImageMagick
    vprint " Failed"
    vprint " Method: ImageMagick's Identify"
    getPageSizeImagemagick
    [[ -z $PGWIDTH && -z $PGHEIGHT ]] || return 0

    # Nothing worked
    vprint " Failed"
    echo "Error when detecting PDF paper size!"
    echo "All methods of detection failed"
    echo "You may want to install pdfinfo or imagemagick"
    exit 17
  }
  # Parse options
  #   -v            raise verbosity (may be repeated)
  #   -h            print help and exit 0
  #   -V            print version and exit 0
  #   -s <factor>   scaling factor, validated by parseScale
  #   -m <mode>     page size detection mode, resolved by parseMode
  # Anything else (unknown option, missing option argument) ends in usage.
  # Afterwards the parsed options are shifted away so that $1 and $2 are
  # the remaining positional arguments.
  while getopts ":vhVs:m:" o; do
    case "${o}" in
      v)
        ((VERBOSE++))
        ;;
      h)
        printHelp
        exit 0
        ;;
      V)
        printVersion
        exit 0
        ;;
      s)
        # quoted: the argument must reach the parser as one untouched word
        parseScale "${OPTARG}"
        ;;
      m)
        parseMode "${OPTARG}"
        ;;
      *)
        usage
        ;;
    esac
  done
  shift $((OPTIND-1))
  ######### START EXECUTION

  # Intro message
  vprint "$BASENAME v$VERSION - Verbose execution"

  # Dependencies: ghostscript and bc are always required, identify and
  # pdfinfo only when their mode was forced on the command line
  vprint "Checking for ghostscript and bcmath"
  command -v gs >/dev/null 2>&1 || printDependency 'ghostscript'
  command -v bc >/dev/null 2>&1 || printDependency 'bc'
  if [[ $MODE = "IDENTIFY" ]]; then
    vprint "Checking for imagemagick's identify"
    command -v identify >/dev/null 2>&1 || printDependency 'imagemagick'
  fi
  if [[ $MODE = "PDFINFO" ]]; then
    vprint "Checking for pdfinfo"
    command -v pdfinfo >/dev/null 2>&1 || printDependency 'pdfinfo'
  fi

  # Get dependency binaries
  GSBIN="$(command -v gs 2>/dev/null)"
  BCBIN="$(command -v bc 2>/dev/null)"
  IDBIN="$(command -v identify 2>/dev/null)"
  # Both helpers are looked up on every OS: a forced "-m pdfinfo" or
  # "-m mdls" must find its binary wherever it happens to be installed.
  # Which one the adaptive mode prefers is decided in getPageSize.
  MDLSBIN="$(command -v mdls 2>/dev/null)"
  PDFINFOBIN="$(command -v pdfinfo 2>/dev/null)"

  # Verbose scale info
  vprint " Scale factor: $SCALE"

  # Validate args (usage prints the synopsis and exits with code 1)
  if [[ $# -lt 1 ]]; then
    usage
  fi
  INFILEPDF="$1"
  if ! [[ "$INFILEPDF" =~ ^..*\.pdf$ ]]; then
    usage
  fi
  if [[ ! -f "$INFILEPDF" ]]; then
    echo "Error! File not found: $INFILEPDF"
    exit 3
  fi
  vprint " Input file: $INFILEPDF"

  # Parse output filename: default to <input>.SCALED.pdf, otherwise make
  # sure the given name ends in exactly one .pdf
  if [[ -z $2 ]]; then
    OUTFILEPDF="${INFILEPDF%.pdf}.SCALED.pdf"
  else
    OUTFILEPDF="${2%.pdf}.pdf"
  fi
  vprint " Output file: $OUTFILEPDF"

  getPageSize
  vprint " Width: $PGWIDTH postscript-points"
  vprint " Height: $PGHEIGHT postscript-points"

  # Compute translation factors (to center the page)
  XTRANS=$(echo "scale=6; 0.5*(1.0-$SCALE)/$SCALE*$PGWIDTH" | "$BCBIN")
  YTRANS=$(echo "scale=6; 0.5*(1.0-$SCALE)/$SCALE*$PGHEIGHT" | "$BCBIN")
  vprint " Translation X: $XTRANS"
  vprint " Translation Y: $YTRANS"

  # Do it.
  "$GSBIN" \
    -q -dNOPAUSE -dBATCH -sDEVICE=pdfwrite -dSAFER \
    -dCompatibilityLevel="1.5" -dPDFSETTINGS="/printer" \
    -dColorConversionStrategy=/LeaveColorUnchanged \
    -dSubsetFonts=true -dEmbedAllFonts=true \
    -dDEVICEWIDTH="$PGWIDTH" -dDEVICEHEIGHT="$PGHEIGHT" \
    -sOutputFile="$OUTFILEPDF" \
    -c "<</BeginPage{$SCALE $SCALE scale $XTRANS $YTRANS translate}>> setpagedevice" \
    -f "$INFILEPDF"