You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

496 lines
17 KiB

10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
9 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
10 years ago
  1. #!/usr/bin/env python2
  2. # -*- coding: utf-8 -*-
  3. """Python module to download videos.
  4. This module contains the actual downloaders responsible
  5. for downloading the video files.
  6. """
  7. from __future__ import unicode_literals
  8. import re
  9. import os
  10. import sys
  11. import locale
  12. import signal
  13. import subprocess
  14. from time import sleep
  15. from Queue import Queue
  16. from threading import Thread
  17. from .utils import convert_item
  18. class PipeReader(Thread):
  19. """Helper class to avoid deadlocks when reading from subprocess pipes.
  20. This class uses python threads and queues in order to read from subprocess
  21. pipes in an asynchronous way.
  22. Attributes:
  23. WAIT_TIME (float): Time in seconds to sleep.
  24. Args:
  25. queue (Queue.Queue): Python queue to store the output of the subprocess.
  26. Warnings:
  27. All the operations are based on 'str' types. The caller has to convert
  28. the queued items back to 'unicode' if he needs to.
  29. """
  30. WAIT_TIME = 0.1
  31. def __init__(self, queue):
  32. super(PipeReader, self).__init__()
  33. self._filedescriptor = None
  34. self._running = True
  35. self._queue = queue
  36. self.start()
  37. def run(self):
  38. # Flag to ignore specific lines
  39. ignore_line = False
  40. while self._running:
  41. if self._filedescriptor is not None:
  42. for line in iter(self._filedescriptor.readline, str('')):
  43. # Ignore ffmpeg stderr
  44. if str('ffmpeg version') in line:
  45. ignore_line = True
  46. if not ignore_line:
  47. self._queue.put_nowait(line)
  48. self._filedescriptor = None
  49. ignore_line = False
  50. sleep(self.WAIT_TIME)
  51. def attach_filedescriptor(self, filedesc):
  52. """Attach a filedescriptor to the PipeReader. """
  53. self._filedescriptor = filedesc
  54. def join(self, timeout=None):
  55. self._running = False
  56. super(PipeReader, self).join(timeout)
  57. class YoutubeDLDownloader(object):
  58. """Python class for downloading videos using youtube-dl & subprocess.
  59. Attributes:
  60. OK, ERROR, STOPPED, ALREADY, FILESIZE_ABORT, WARNING (int): Integers
  61. that describe the return code from the download() method. The
  62. larger the number the higher is the hierarchy of the code.
  63. Codes with smaller hierachy cannot overwrite codes with higher
  64. hierarchy.
  65. Args:
  66. youtubedl_path (string): Absolute path to youtube-dl binary.
  67. data_hook (function): Optional callback function to retrieve download
  68. process data.
  69. log_data (function): Optional callback function to write data to
  70. the log file.
  71. Warnings:
  72. The caller is responsible for calling the close() method after he has
  73. finished with the object in order for the object to be able to properly
  74. close down itself.
  75. Example:
  76. How to use YoutubeDLDownloader from a python script.
  77. from downloaders import YoutubeDLDownloader
  78. def data_hook(data):
  79. print data
  80. downloader = YoutubeDLDownloader('/usr/bin/youtube-dl', data_hook)
  81. downloader.download(<URL STRING>, ['-f', 'flv'])
  82. """
  83. OK = 0
  84. WARNING = 1
  85. ERROR = 2
  86. FILESIZE_ABORT = 3
  87. ALREADY = 4
  88. STOPPED = 5
  89. def __init__(self, youtubedl_path, data_hook=None, log_data=None):
  90. self.youtubedl_path = youtubedl_path
  91. self.data_hook = data_hook
  92. self.log_data = log_data
  93. self._return_code = self.OK
  94. self._proc = None
  95. self._stderr_queue = Queue()
  96. self._stderr_reader = PipeReader(self._stderr_queue)
  97. def download(self, url, options):
  98. """Download url using given options.
  99. Args:
  100. url (string): URL string to download.
  101. options (list): Python list that contains youtube-dl options.
  102. Returns:
  103. An integer that shows the status of the download process.
  104. There are 6 different return codes.
  105. OK (0): The download process completed successfully.
  106. WARNING (1): A warning occured during the download process.
  107. ERROR (2): An error occured during the download process.
  108. FILESIZE_ABORT (3): The corresponding url video file was larger or
  109. smaller from the given filesize limit.
  110. ALREADY (4): The given url is already downloaded.
  111. STOPPED (5): The download process was stopped by the user.
  112. """
  113. self._return_code = self.OK
  114. cmd = self._get_cmd(url, options)
  115. self._create_process(cmd)
  116. if self._proc is not None:
  117. self._stderr_reader.attach_filedescriptor(self._proc.stderr)
  118. while self._proc_is_alive():
  119. stdout = self._proc.stdout.readline().rstrip()
  120. stdout = convert_item(stdout, to_unicode=True)
  121. if stdout:
  122. data_dict = extract_data(stdout)
  123. self._extract_info(data_dict)
  124. self._hook_data(data_dict)
  125. # Read stderr after download process has been completed
  126. # We don't need to read stderr in real time
  127. while not self._stderr_queue.empty():
  128. stderr = self._stderr_queue.get_nowait().rstrip()
  129. stderr = convert_item(stderr, to_unicode=True)
  130. self._log(stderr)
  131. if self._is_warning(stderr):
  132. self._set_returncode(self.WARNING)
  133. else:
  134. self._set_returncode(self.ERROR)
  135. # Set return code to ERROR if we could not start the download process
  136. # or the childs return code is greater than zero
  137. # NOTE: In Linux if the called script is just empty Python exits
  138. # normally (ret=0), so we cant detect this or similar cases
  139. # using the code below
  140. # NOTE: In Unix a negative return code (-N) indicates that the child
  141. # was terminated by signal N (e.g. -9 = SIGKILL)
  142. if self._proc is None or self._proc.returncode > 0:
  143. self._return_code = self.ERROR
  144. if self._proc is not None and self._proc.returncode > 0:
  145. self._log('Child process exited with non-zero code: {}'.format(self._proc.returncode))
  146. self._last_data_hook()
  147. return self._return_code
  148. def stop(self):
  149. """Stop the download process and set return code to STOPPED. """
  150. if self._proc_is_alive():
  151. if os.name == 'nt':
  152. # os.killpg is not available on Windows
  153. # See: https://bugs.python.org/issue5115
  154. self._proc.kill()
  155. # When we kill the child process on Windows the return code
  156. # gets set to 1, so we want to reset the return code back to 0
  157. # in order to avoid creating logging output in the download(...)
  158. # method
  159. self._proc.returncode = 0
  160. else:
  161. os.killpg(self._proc.pid, signal.SIGKILL)
  162. self._set_returncode(self.STOPPED)
  163. def close(self):
  164. """Destructor like function for the object. """
  165. self._stderr_reader.join()
  166. def _set_returncode(self, code):
  167. """Set self._return_code only if the hierarchy of the given code is
  168. higher than the current self._return_code. """
  169. if code >= self._return_code:
  170. self._return_code = code
  171. def _is_warning(self, stderr):
  172. return stderr.split(':')[0] == 'WARNING'
  173. def _last_data_hook(self):
  174. """Set the last data information based on the return code. """
  175. data_dictionary = {}
  176. if self._return_code == self.OK:
  177. data_dictionary['status'] = 'Finished'
  178. elif self._return_code == self.ERROR:
  179. data_dictionary['status'] = 'Error'
  180. data_dictionary['speed'] = ''
  181. data_dictionary['eta'] = ''
  182. elif self._return_code == self.WARNING:
  183. data_dictionary['status'] = 'Warning'
  184. data_dictionary['speed'] = ''
  185. data_dictionary['eta'] = ''
  186. elif self._return_code == self.STOPPED:
  187. data_dictionary['status'] = 'Stopped'
  188. data_dictionary['speed'] = ''
  189. data_dictionary['eta'] = ''
  190. elif self._return_code == self.ALREADY:
  191. data_dictionary['status'] = 'Already Downloaded'
  192. else:
  193. data_dictionary['status'] = 'Filesize Abort'
  194. self._hook_data(data_dictionary)
  195. def _extract_info(self, data):
  196. """Extract informations about the download process from the given data.
  197. Args:
  198. data (dict): Python dictionary that contains different
  199. keys. The keys are not standar the dictionary can also be
  200. empty when there are no data to extract. See extract_data().
  201. """
  202. if 'status' in data:
  203. if data['status'] == 'Already Downloaded':
  204. # Set self._return_code to already downloaded
  205. # and trash that key
  206. self._set_returncode(self.ALREADY)
  207. data['status'] = None
  208. if data['status'] == 'Filesize Abort':
  209. # Set self._return_code to filesize abort
  210. # and trash that key
  211. self._set_returncode(self.FILESIZE_ABORT)
  212. data['status'] = None
  213. def _log(self, data):
  214. """Log data using the callback function. """
  215. if self.log_data is not None:
  216. self.log_data(data)
  217. def _hook_data(self, data):
  218. """Pass data back to the caller. """
  219. if self.data_hook is not None:
  220. self.data_hook(data)
  221. def _proc_is_alive(self):
  222. """Returns True if self._proc is alive else False. """
  223. if self._proc is None:
  224. return False
  225. return self._proc.poll() is None
  226. def _get_cmd(self, url, options):
  227. """Build the subprocess command.
  228. Args:
  229. url (string): URL string to download.
  230. options (list): Python list that contains youtube-dl options.
  231. Returns:
  232. Python list that contains the command to execute.
  233. """
  234. if os.name == 'nt':
  235. cmd = [self.youtubedl_path] + options + [url]
  236. else:
  237. cmd = ['python', self.youtubedl_path] + options + [url]
  238. return cmd
  239. def _create_process(self, cmd):
  240. """Create new subprocess.
  241. Args:
  242. cmd (list): Python list that contains the command to execute.
  243. """
  244. info = preexec = None
  245. # Keep a unicode copy of cmd for the log
  246. ucmd = cmd
  247. if os.name == 'nt':
  248. # Hide subprocess window
  249. info = subprocess.STARTUPINFO()
  250. info.dwFlags |= subprocess.STARTF_USESHOWWINDOW
  251. else:
  252. # Make subprocess the process group leader
  253. # in order to kill the whole process group with os.killpg
  254. preexec = os.setsid
  255. # Encode command for subprocess
  256. # Refer to http://stackoverflow.com/a/9951851/35070
  257. if sys.version_info < (3, 0):
  258. cmd = convert_item(cmd, to_unicode=False)
  259. try:
  260. self._proc = subprocess.Popen(cmd,
  261. stdout=subprocess.PIPE,
  262. stderr=subprocess.PIPE,
  263. preexec_fn=preexec,
  264. startupinfo=info)
  265. except (ValueError, OSError) as error:
  266. self._log('Failed to start process: {}'.format(ucmd))
  267. self._log(convert_item(str(error), to_unicode=True))
  268. def extract_data(stdout):
  269. """Extract data from youtube-dl stdout.
  270. Args:
  271. stdout (string): String that contains the youtube-dl stdout.
  272. Returns:
  273. Python dictionary. The returned dictionary can be empty if there are
  274. no data to extract else it may contain one or more of the
  275. following keys:
  276. 'status' : Contains the status of the download process.
  277. 'path' : Destination path.
  278. 'extension' : The file extension.
  279. 'filename' : The filename without the extension.
  280. 'percent' : The percentage of the video being downloaded.
  281. 'eta' : Estimated time for the completion of the download process.
  282. 'speed' : Download speed.
  283. 'filesize' : The size of the video file being downloaded.
  284. 'playlist_index' : The playlist index of the current video file being downloaded.
  285. 'playlist_size' : The number of videos in the playlist.
  286. """
  287. # REFACTOR
  288. def extract_filename(input_data):
  289. path, fullname = os.path.split(input_data.strip("\""))
  290. filename, extension = os.path.splitext(fullname)
  291. return path, filename, extension
  292. data_dictionary = {}
  293. if not stdout:
  294. return data_dictionary
  295. # We want to keep the spaces in order to extract filenames with
  296. # multiple whitespaces correctly. We also keep a copy of the old
  297. # 'stdout' for backward compatibility with the old code
  298. stdout_with_spaces = stdout.split(' ')
  299. stdout = stdout.split()
  300. stdout[0] = stdout[0].lstrip('\r')
  301. if stdout[0] == '[download]':
  302. data_dictionary['status'] = 'Downloading'
  303. # Get path, filename & extension
  304. if stdout[1] == 'Destination:':
  305. path, filename, extension = extract_filename(' '.join(stdout_with_spaces[2:]))
  306. data_dictionary['path'] = path
  307. data_dictionary['filename'] = filename
  308. data_dictionary['extension'] = extension
  309. # Get progress info
  310. if '%' in stdout[1]:
  311. if stdout[1] == '100%':
  312. data_dictionary['speed'] = ''
  313. data_dictionary['eta'] = ''
  314. data_dictionary['percent'] = '100%'
  315. data_dictionary['filesize'] = stdout[3]
  316. else:
  317. data_dictionary['percent'] = stdout[1]
  318. data_dictionary['filesize'] = stdout[3]
  319. data_dictionary['speed'] = stdout[5]
  320. data_dictionary['eta'] = stdout[7]
  321. # Get playlist info
  322. if stdout[1] == 'Downloading' and stdout[2] == 'video':
  323. data_dictionary['playlist_index'] = stdout[3]
  324. data_dictionary['playlist_size'] = stdout[5]
  325. # Remove the 'and merged' part from stdout when using ffmpeg to merge the formats
  326. if stdout[-3] == 'downloaded' and stdout [-1] == 'merged':
  327. stdout = stdout[:-2]
  328. stdout_with_spaces = stdout_with_spaces[:-2]
  329. data_dictionary['percent'] = '100%'
  330. # Get file already downloaded status
  331. if stdout[-1] == 'downloaded':
  332. data_dictionary['status'] = 'Already Downloaded'
  333. path, filename, extension = extract_filename(' '.join(stdout_with_spaces[1:-4]))
  334. data_dictionary['path'] = path
  335. data_dictionary['filename'] = filename
  336. data_dictionary['extension'] = extension
  337. # Get filesize abort status
  338. if stdout[-1] == 'Aborting.':
  339. data_dictionary['status'] = 'Filesize Abort'
  340. elif stdout[0] == '[hlsnative]':
  341. # native hls extractor
  342. # see: https://github.com/rg3/youtube-dl/blob/master/youtube_dl/downloader/hls.py#L54
  343. data_dictionary['status'] = 'Downloading'
  344. if len(stdout) == 7:
  345. segment_no = float(stdout[6])
  346. current_segment = float(stdout[4])
  347. # Get the percentage
  348. percent = '{0:.1f}%'.format(current_segment / segment_no * 100)
  349. data_dictionary['percent'] = percent
  350. elif stdout[0] == '[ffmpeg]':
  351. data_dictionary['status'] = 'Post Processing'
  352. # Get final extension after merging process
  353. if stdout[1] == 'Merging':
  354. path, filename, extension = extract_filename(' '.join(stdout_with_spaces[4:]))
  355. data_dictionary['path'] = path
  356. data_dictionary['filename'] = filename
  357. data_dictionary['extension'] = extension
  358. # Get final extension ffmpeg post process simple (not file merge)
  359. if stdout[1] == 'Destination:':
  360. path, filename, extension = extract_filename(' '.join(stdout_with_spaces[2:]))
  361. data_dictionary['path'] = path
  362. data_dictionary['filename'] = filename
  363. data_dictionary['extension'] = extension
  364. # Get final extension after recoding process
  365. if stdout[1] == 'Converting':
  366. path, filename, extension = extract_filename(' '.join(stdout_with_spaces[8:]))
  367. data_dictionary['path'] = path
  368. data_dictionary['filename'] = filename
  369. data_dictionary['extension'] = extension
  370. elif stdout[0][0] != '[' or stdout[0] == '[debug]':
  371. pass # Just ignore this output
  372. else:
  373. data_dictionary['status'] = 'Pre Processing'
  374. return data_dictionary